diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..37ec93a --- /dev/null +++ b/LICENSE @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). 
+ +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. 
+ +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file 
distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
+ +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index f03c4b1..d7cbedf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,69 @@ SIMDCompressionAndIntersection -============================== +====================== -A C++ library to compress and intersect sorted lists of integers using SIMD instructions +As the name suggests, this is a C/C++ library for fast +compression and intersection of lists of sorted integers using +SIMD instructions. The library focuses on innovative techniques +and very fast schemes, with particular attention to differential +coding. It introduces new SIMD intersections schemes such as +SIMD Galloping. + +Authors: Leonid Boystov, Nathan Kurz and Daniel Lemire +With some contributions from Owen Kaser and others. 
+ +Simple demo +------------------------ + +Check out example.cpp + +You can run it like so: + +make example +./example + +Usage +------------------------ + +make +./unit + +To run tests, you can do + +./testcodecs + +(follow the instructions) + + + +Comparison with the FastPFOR C++ library +----------------------------------------- + +The FastPFOR C++ Library available at https://github.com/lemire/FastPFor +implements some of the same compression schemes except that +it is not optimized for the compression of sorted lists of integers. + + +Licensing +------------------------ + +Apache License, Version 2.0 + +As far as the authors know, this work is patent-free. + +Requirements +------------------------ + +A recent GCC (4.7 or better), Clang or Intel compiler. + +A processor support AVX (Intel or AMD). + +Tested on Linux and MacOS. It should be portable to Windows and other platforms. + + + + +For advanced benchmarking, please see + +advancedbenchmarking/README.md + +where there is additional information. diff --git a/advancedbenchmarking/README.md b/advancedbenchmarking/README.md new file mode 100644 index 0000000..bfc506d --- /dev/null +++ b/advancedbenchmarking/README.md @@ -0,0 +1,74 @@ +Advanced benchmarking +===================== + + +The goal of this subdirectory is to provide some utilities that are +useful for "advanced benchmarking". + + +* simplesynth is a utility that generates a file containing randomly generated arrays. +* compress is a utility that takes a file containing uncompressed (regular) arrays and generates compressed arrays. +* uncompress is a utility that takes a file containing compressed arrays and generates uncompressed arrays. 
+* budgetedtest intersect is a program to benchmark decompression speed together with fast intersections +* entropy can be used to compute the entropy of the deltas given a set of (uncompressed) posting lists + +Sample usage +------------- + +go to root directory in the project +$ make compress uncompress simplesynth +$ ./simplesynth -m 100 -N 10000 -R 20 test.bin +$ ./compress -s varint test.bin ctest.bin +$ ./uncompress ctest.bin recovered.bin + + +Working with intersections +-------------------------- + +First, you must get the clueweb09 dataset (or a similar data set). You can get +the clueweb09 data we use from http://boytsov.info/datasets/clueweb09gap/ + +You need a posting list file in a flat format: a sequence of arrays stored as a 32-bit unsigned integer +indicating length followed by a corresponding sequence of 32-bit unsigned integers. + +You can quickly generate a test file from a synthetic model: + +./simplesynth -N 100000 -m 100 test.bin + +You can construct such a data file from the clueweb09 dataset made available by Leonid +Boystov (http://boytsov.info/datasets/clueweb09gap/). For Nathan and Leo, this +is available on the test machine at /home/data/clueweb/Uncompressed/flat.bin (October 2013). + +You also need a query log file. It needs to be a text file where each row contains +a sequence of integers (separated by white space). Each row corresponds to a query: each +integer is a the index of the corresponding posting list. For example, the row "0 1 2" corresponds +to the first 3 posting lists. Leonid Boytsov prepared such a query log from the infamous +AOL web query log file, matching the clueweb09 data set. (By convention, the number "1000000" +corresponds to an "unknown" posting list because there are 1000000 different terms in the +clueweb09 data set that Leo indexed: a more general convention is that an out-of-range +index can be ignored.) For Nathan and Leo, these files +are available under /home/data/AOL/ in the test machine (October 2013). 
+ +If you want, you can just make one up by creating a text file and entering, on each line, a series of distinct integers (say between 0 and 100) separated by spaces. + +You can run a test as follows: + +./budgetedtest /home/data/clueweb/Uncompressed/flat.bin /home/data/AOL/user-ct-test-collection-01.id + -i simd -b 24 -s varint +-s specify the compression scheme (if none: default on uncompressed) +-b is is the memory budget in gigabytes. +-i allows you to specify an intersection routine (default on a fast SIMD-based intersection routine). +-o is to include one-word queries (they don't participate in intersections, only in decoding) +-q allows you to specify that you just want the final report (warning: the program can take a long time to complete) +-l allows you to limit the number of queries (e.g., -l 1000) +-p allows you to partition the postings during compression and decompression (param: number of partitions) +-k to test Skipping (must provide a gap size param between 1 and 31) +-B to activate bitmap mode (recommended values are in the range 8 to 32), when set to 0 in conjunction with compression, it means 'automatic' +-d to dump complete statistics + + +Similarly, if you want the corresponding entropy, you can run + +./entropy /home/data/clueweb/Uncompressed/flat.bin /home/data/AOL/user-ct-test-collection-01.id + +It has similar flags as budgetedtest. 
diff --git a/advancedbenchmarking/extra/scriptbudget.sh b/advancedbenchmarking/extra/scriptbudget.sh new file mode 100755 index 0000000..f962a76 --- /dev/null +++ b/advancedbenchmarking/extra/scriptbudget.sh @@ -0,0 +1,14 @@ + +for inter in simd galloping scalar; do +echo "%%%% Trying out uncompressed first" $inter +./budgetedtest /home/data/clueweb/Uncompressed/flat.bin /home/data/AOL/user-ct-test-collection-01.id -i $inter -q -b 12 -l 5000 +for scheme in bp copy ibp ironpeter kurzbp simdbp2 simdbp4 simdbpm simdbpr simdfastpfor2 simdfastpfor4 simdfastpform simdfastpforr simdibp2 simdibp4 simdibpm simdibpr simdifastpfor2 simdifastpfor4 simdifastpform simdifastpforr simdvarintgb4 simdvarintgbr simdvarintgbr2 simdvarintgbrm varintgbr varintr; do +echo "######## testing" $inter $scheme +./budgetedtest /home/data/clueweb/Uncompressed/flat.bin /home/data/AOL/user-ct-test-collection-01.id -i $inter -s $scheme -q -b 12 -l 5000 +for p in 8 16 32; do +echo "##### number of partitions = " $p +./budgetedtest /home/data/clueweb/Uncompressed/flat.bin /home/data/AOL/user-ct-test-collection-01.id -i $inter -s $scheme -q -b 12 -l 5000 -p $p +echo +done +done +done diff --git a/advancedbenchmarking/extra/scriptbudget_bitmap.sh b/advancedbenchmarking/extra/scriptbudget_bitmap.sh new file mode 100755 index 0000000..972ca11 --- /dev/null +++ b/advancedbenchmarking/extra/scriptbudget_bitmap.sh @@ -0,0 +1,13 @@ +make entropy budgetedtest +./entropy /home/data/clueweb/Uncompressed/flat.bin /home/data/AOL/user-ct-test-collection-01.id -b 12 -l 1000 +for inter in simd galloping; do +for scheme in kurzbp simdfastpforr fastpfor varintr ; do +echo "######## testing" $inter $scheme +for B in 8 32 ; do +echo "# B = " $B +./budgetedtest /home/data/clueweb/Uncompressed/flat.bin /home/data/AOL/user-ct-test-collection-01.id -i $inter -s $scheme -q -b 12 -l 1000 -B $B +echo +done +done +done +./email.sh diff --git a/advancedbenchmarking/include/budgetedpostingcollector.h 
b/advancedbenchmarking/include/budgetedpostingcollector.h new file mode 100644 index 0000000..ed6aa4d --- /dev/null +++ b/advancedbenchmarking/include/budgetedpostingcollector.h @@ -0,0 +1,222 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +#ifndef BUDGETEDPOSTINGCOLLECTOR_H_ +#define BUDGETEDPOSTINGCOLLECTOR_H_ + + +#include "common.h" +#include "util.h" +#include "maropuparser.h" +#include "intersection.h" + +/* + * This class reads postings until their total size is below a threshold (memory budget). + * It encapsulates an out-of-order posting retrieval. It is not straightforward, because + * the posting file format lacks a dictionary. + */ +class BudgetedPostingCollector { +public: + BudgetedPostingCollector(const string& postFileName, size_t memBudget) : + seenOffsets(0), readPostings(), + gapReader(postFileName), memBudget(memBudget), memUsed(0) + { + if (!gapReader.open()) { + throw runtime_error(" could not open " + postFileName + " for reading..."); + } + readOffsets(); + } + + ~BudgetedPostingCollector() { + gapReader.close(); + } + + /* + * checks whether the posting list index is between 0 and getMaxPostQty() + */ + bool valid(uint32_t postId) { + return postId < seenOffsets.size(); + } + + + /* + * Returns false, when the memory budget is exhausted. + * Throws an exception in case of an error. It is assumed + * that postId is a valid identifier (call valid() to check). 
+ */ + bool loadOnePost(uint32_t postId) { + if(readPostings.find(postId) != readPostings.end()) + return true;// already loaded, nothing to do + assert(valid(postId)); + gapReader.setPos(seenOffsets[postId]); + if (!gapReader.loadIntegers(readPostings[postId])) { + stringstream err; + err << "Cannot read posting list, id = id" << postId; + throw runtime_error(err.str()); + } + size_t qty = readPostings[postId].size(); + readPostings[postId].shrink_to_fit();// may or may not be useful + if(qty == 0) cout << "[WARNING] Empty posting list found." < memBudget) { + readPostings.erase(postId); + return false; + } + memUsed += addMem; + return true; + } + /* + * Returns false, when the memory budget is exhausted. + * Throws an exception in case of an error. + * It is assume that all ideas are valid (call valid()). + * + * this is basically a convenience wrapper around loadOnePost. + */ + bool loadPostings(const vector& postIds) { + for (const auto id: postIds) { + if (!loadOnePost(id)) return false; + } + return true; + } + + /* + * how many posting lists are there to be loaded? + * this is not necessarily the total number of buffered + * posting lists. + */ + size_t getMaxPostQty() const { return seenOffsets.size(); } + + + /** + * This finds the largest and smallest document ID from a set of queries. 
+ */ + pair findDocumentIDRange(const vector>& allPostIds) { + uint32_t largestpossible = numeric_limits::max(); + uint32_t smallestpossible = 0; + pair answer = make_pair(largestpossible,smallestpossible); + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + vector& onePost = getOnePost(id); + if(onePost.empty()) continue; + if(onePost.front() < answer.first) answer.first = onePost.front(); + if(onePost.back() > answer.second) answer.second = onePost.back(); + } + } + assert(answer.first<=answer.second); + return answer; + } + + /** + * Given a set of queries, this will find the size of the largest + * posting list corresponding to one of the IDs being queried. + * Before calling this function, the data should have be + * pre-loaded using the function loadOnePost or loadPostings; + * an exception is thrown otherwise. + */ + size_t findMaxPostingSize(const vector>& allPostIds) { + size_t MaxPostingSize(0); + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + vector& onePost = getOnePost(id); + if(MaxPostingSize < onePost.size()) MaxPostingSize = onePost.size(); + } + } + return MaxPostingSize; + } + /* + * Before getOnePost are called, the data should be + * pre-loaded using the function loadOnePost or loadPostings; + * an exception is thrown otherwise. + */ + vector& getOnePost(uint32_t postId) { + if (readPostings.find(postId) == readPostings.end()) { + throw runtime_error("Should call loadIntegers before can access postings!"); + } + return readPostings[postId]; + } + + size_t getSizeInInts(uint32_t postId) { + return getOnePost(postId).size(); + } + + /** + * Flushes all posting lists, recovering the memory. + */ + void clear() { + readPostings.clear(); + memUsed = 0; + } + + /** + * Given a set of posting IDs, compute the corresponding intersection. + * This is *specifically* not meant to be fast. Do not use if you need + * good speed. (It *may* be fast... but it is not the design goal.) 
+ * + * The data should be pre-loaded using the function loadOnePost or loadPostings; + * an exception is thrown otherwise. + */ + vector computeIntersection(const vector & qids, intersectionfunction Inter) { + vector inter; + if(qids.empty()) return inter; + vector > sizeids; + for(uint32_t i : qids) { + sizeids.emplace_back(make_pair(getOnePost(i).size(),i)); + } + sort(sizeids.begin(), sizeids.end()); + inter = getOnePost(sizeids.front().second); + size_t intersize = inter.size(); + for(size_t k = 1; k seenOffsets; // location of the postings + unordered_map > readPostings; // accumulated postings + + MaropuGapReader gapReader; // reader to recover the postings + size_t memBudget; + size_t memUsed; +}; + + + + +#endif /* BUDGETEDPOSTINGCOLLECTOR_H_ */ diff --git a/advancedbenchmarking/include/maropuparser.h b/advancedbenchmarking/include/maropuparser.h new file mode 100644 index 0000000..d3d6bce --- /dev/null +++ b/advancedbenchmarking/include/maropuparser.h @@ -0,0 +1,167 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#ifndef MAROPUPARSER_H_ +#define MAROPUPARSER_H_ + +#include +#include + +#include + +#include "common.h" + +using namespace std; + +/** + * This is just a bit of code to parse the binary files provided by the + * Maropu-Open-Coders library at + * http://integerencoding.isti.cnr.it/?page_id=8 + * + * (Despite the name, this does not necessarily reads gaps.) + * + * Note that due to use of strerror this code may be thread-unsafe! + * + */ +class MaropuGapReader { +public: + MaropuGapReader(const string & filename) : + mFilename(filename), fd(NULL) { + } + + + /** + * The copy constructor will assign the same file name, + * but the newly constructed object won't be opened. 
+ */ + MaropuGapReader(const MaropuGapReader & mgr) : + mFilename(mgr.mFilename), fd(NULL) { + } + + /** + * Assignment will close the current reader, and change + * the file name. You need to reopen the reader after the assignment. + */ + MaropuGapReader& operator=(const MaropuGapReader & mgr) { + close(); + mFilename = mgr.mFilename; + return *this; + } + + ~MaropuGapReader() { + close(); + } + + // @daniel: should we worry about our code being compilable on 32-bit machines? + // if so, we need to add -D_FILE_OFFSET_BITS=64 to the makefile + // Daniel: it would seem odd to consider 32-bit machines when we assume AVX support! + off_t getPos() { + errno = 0; + off_t res = ftello(fd); + if (res < 0) { + stringstream err; + err << "Error getting file position, IO status: " << strerror(errno); + throw runtime_error(err.str()); + } + return res; + } + + void setPos(off_t pos) { + errno = 0; + off_t res = fseeko(fd, pos, SEEK_SET); + if (res < 0) { + stringstream err; + err << "Error setting file position, IO status: " << strerror(errno); + throw runtime_error(err.str()); + } + } + + /* + * Return false if no more data can be loaded. + * Throw an exception in the case of IO error. + */ + template + bool loadIntegers(container & buffer) { + uint32_t qty = 0; + if (!ReadQty(qty)) return false; // EOF + buffer.resize(qty); + errno = 0; + size_t result = fread(buffer.data(), sizeof(uint32_t), buffer.size(), fd); + if (result != buffer.size()) { + if (!errno) { + // If we can't read, the file maybe truncated, i.e., corrupt + throw runtime_error("The file appears to be truncated/corrupt!"); + } + stringstream err; + err << "Error reading from file, IO status: " << strerror(errno); + throw runtime_error(err.str()); + } + return true; + } + + /* + * Return false if no more data can be loaded. + * Throw an exception in the case of IO error. 
+ */ + bool readNextPosAndQty(off_t& pos, uint32_t& qty) { + pos = getPos(); + if (!ReadQty(qty)) return false; // EOF + setPos(getPos() + qty * sizeof(uint32_t)); + return true; + } + + /** + * We must call open before we can use this class meaningfully. + */ + bool open() { + close(); + fd = ::fopen(mFilename.c_str(), "rb"); + if (fd == NULL) { + return false; + } + setvbuf (fd , NULL , _IOFBF , 1024*4 ); // large buffer + return true; + } + + void close() { + if (fd != NULL) { + ::fclose(fd); + fd = NULL; + } + } +private: + /* + * Returns false on EOF. + * Throws an exception in the case of IO error. + */ + bool ReadQty(uint32_t& qty) { + qty = 0; + if(fd == NULL) { + throw runtime_error("You forgot to open the file."); + } + errno = 0; + size_t result = fread(&qty, sizeof(qty), 1, fd); + if (errno) { + stringstream err; + err << "Error opening file, IO status: " << strerror(errno); + throw runtime_error(err.str()); + } + if (result != 1) { + return false; + } + if(qty > 1<<29) { + cout << "warning: reading a very large array ("<< qty << " integers) : is your input file in the right format?"< + inline void endQuery(size_t intersectionsize, SizeProvider& sizeprovider, + const vector& PostIds) { + uint64_t timeinmicroseconds = timer.split(); + timesinmicros.push_back(timeinmicroseconds); + ss << timeinmicroseconds << "\t"; + ss << intersectionsize << "\t"; + for(uint32_t id: PostIds) { + ss << sizeprovider.getSizeInInts(id) << "\t"; + } + ss << endl; + } + + // average time in ms per query + double averageTimeInMS() const { + if(timesinmicros.size() == 0) return 0; + return static_cast(std::accumulate(timesinmicros.begin(),timesinmicros.end(),0)) * 0.001 / + static_cast(timesinmicros.size()); + } + + + // average time in ms per query + double medianTimeInMS() const { + if(timesinmicros.size() == 0) return 0; + vector buffer(timesinmicros); + sort(buffer.begin(),buffer.end()); + return static_cast(buffer[buffer.size()/2]) * 0.001;// not *exactly* the median but 
close enough + } + + // average time in ms per query + double ninetypercentileTimeInMS() const { + if(timesinmicros.size() == 0) return 0; + vector buffer(timesinmicros); + sort(buffer.begin(),buffer.end()); + return static_cast(buffer[static_cast(round(buffer.size()*0.9))]) * 0.001;// not *exactly* the 90 perc. but close enough + } + + + /** + * Dump the CSV data + */ + void output(ostream & out) { + out << ss.str(); + } + + void reset() { + ss.str(""); + ss.seekp(0); + ss.seekg(0); + timesinmicros.clear(); + } + + +private: + stringstream ss; + WallClockTimer timer; + vector timesinmicros; +}; + +#endif /* STATISTICSRECORDER_H_ */ diff --git a/advancedbenchmarking/src/budgetedtest.cpp b/advancedbenchmarking/src/budgetedtest.cpp new file mode 100644 index 0000000..f14dcaa --- /dev/null +++ b/advancedbenchmarking/src/budgetedtest.cpp @@ -0,0 +1,1005 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +# +#include +#include + + +#include "common.h" +#include "util.h" +#include "timer.h" +#include "maropuparser.h" +#include "codecfactory.h" +#include "budgetedpostingcollector.h" +#include "skipping.h" +#include "hybm2.h" +#include "statisticsrecorder.h" + + +void printusage(char *prog) { + cout << "Usage: " << prog << " -b -s -l -i -o " << endl; + cout << " -s specify compression scheme ( " ; + for (string s: CODECFactory::allNames()) cout<< s <<" "; + cout<<")"<>& allPostIds) { + unordered_map > compPostings; // we use a hash table to store the postings, ought to fit in RAM + size_t MaxPostingSize(0); + + vector < uint32_t > dirtyCopy; + WallClockTimer z;// this is use only to time what we care + WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about + // 1. 
Benchmark only compression + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + if(compPostings.find(id) != compPostings.end()) // these hits could potentially slow us down a bit + continue; // was compressed earlier + vector& onePost = uncompPosts.getOnePost(id); + size_t qty = onePost.size(); + vector < uint32_t > compressedBuffer(qty); // this is potential a bit expensive: memory allocation + if (MaxPostingSize < qty) { + MaxPostingSize = qty; + } + if(modifiesinput) // if true there is a copy which might be expensive + dirtyCopy = onePost; // some schemes might modify the input, hence it is necessary to make a copy in general + size_t nvalue = compressedBuffer.size(); + ////////////// + // BEGIN performance-sensitive section for *compression* + // (we don't care that much about it). + ///////////// + z.reset(); + if(modifiesinput) + scheme.encodeArray(dirtyCopy.data(), dirtyCopy.size(), + compressedBuffer.data(), nvalue); + else + scheme.encodeArray(onePost.data(), onePost.size(), + compressedBuffer.data(), nvalue); + packTime += static_cast (z.split()); + ///////////// + // END performance-sensitive code for *compression* + ///////////// + assert(nvalue <= compressedBuffer.size()); + packVolume += qty; + CompressedSizeDuringPacking += nvalue; + compressedBuffer.resize(nvalue); + compressedBuffer.shrink_to_fit();// this may or may not be useful + compPostings.emplace(id, compressedBuffer);// with some luck, compressedBuffer is *moved* to the container, not copied + } + } + CoarsePackTime += static_cast(coarsez.split()); + + vector < uint32_t > recoveryBuffer(MaxPostingSize+1024); + if(MaxRecoveryBuffer < recoveryBuffer.size()) MaxRecoveryBuffer=recoveryBuffer.size(); + vector< uint32_t > intersection_result(recoveryBuffer.size()); + // pre2: verify results + /** + * We first test that the compressed version can be uncompressed back + * to the original (duh!) 
+ */ + for(unordered_map >::iterator i = compPostings.begin() ; i != compPostings.end(); ++i ) { + const vector & compressed = i->second; + const vector & uncompressed = uncompPosts.getOnePost(i->first); + vector recbuffer(uncompressed.size() + 1024); + size_t recoveredsize = recbuffer.size(); + scheme.decodeArray(compressed.data(), compressed.size(), + recbuffer.data(), recoveredsize); + recbuffer.resize(recoveredsize); + if(recbuffer.size() != uncompressed.size()) { + cout<<"Original array had size "< & qids: allPostIds) { + // we begin by recoving references to the posting lists. + vector * > > queryCompPost; + for (uint32_t id: qids) { + const vector& onePost = uncompPosts.getOnePost(id); + queryCompPost.emplace_back(make_pair(onePost.size(), & compPostings[id])); + } + assert(!queryCompPost.empty()); + // Sort in the order of increasing posting size + // Daniel: this is to make the SvS intersection faster... + sort(queryCompPost.begin(), queryCompPost.end()); + // the first posting list is a particular case, so we do it separately. 
+ size_t intersectioncardinality = intersection_result.size(); + scheme.decodeArray(queryCompPost.front().second->data(), queryCompPost.front().second->size(), + intersection_result.data(), intersectioncardinality); + assert(intersectioncardinality<=intersection_result.size()); + for (size_t i = 1; (intersectioncardinality>0) && (i < queryCompPost.size()); ++i) { + size_t recoveredsize = recoveryBuffer.size(); + scheme.decodeArray(queryCompPost[i].second->data(), queryCompPost[i].second->size(), + recoveryBuffer.data(), recoveredsize); + assert(recoveredsize<=recoveryBuffer.size()); + intersectioncardinality = Inter(intersection_result.data(),intersectioncardinality, + recoveryBuffer.data(), recoveredsize,intersection_result.data()); + } + vector trueintersection = uncompPosts.computeIntersection(qids,onesidedgallopingintersection); + if(trueintersection.size() != intersectioncardinality) { + cout<<"expected cardinality: "< & qids: allPostIds) { + SR.prepareQuery(); + // we begin by recoving references to the posting lists. + vector * > > queryCompPost; + for (uint32_t id: qids) { + const vector& onePost = uncompPosts.getOnePost(id); + queryCompPost.emplace_back(make_pair(onePost.size(), & compPostings[id])); + } + assert(!queryCompPost.empty()); + // Sort in the order of increasing posting size + // Daniel: this is to make the SvS intersection faster... + // we don't time this operation + sort(queryCompPost.begin(), queryCompPost.end()); + // the first posting list is a particular case, so we do it separately. + + // we use pointers explicitely to me it easier for non-STL folks + const uint32_t * input = queryCompPost.front().second->data(); + size_t inputsize = queryCompPost.front().second->size(); + uint32_t * const intersectionbuffer = intersection_result.data(); + + ////////////////////// + // BEGIN performance-sensitive section + // Note that we possibly uncompress to RAM, and not to cache. + // Moreover, input might not be in cache. 
+ ////////////////////// + z.reset(); + size_t intersectioncardinality = intersection_result.size(); + scheme.decodeArray(input,inputsize , + intersectionbuffer, intersectioncardinality); + unpackTime += static_cast(z.split()); + ///////////////////// + // END performance-sensitive section + ///////////////////// + assert(intersectioncardinality<=intersection_result.size()); + unpackVolume += intersectioncardinality; + uint32_t * const recoverybuffer = recoveryBuffer.data(); + for (size_t i = 1; (intersectioncardinality >0) && (i < queryCompPost.size()); ++i) { + size_t recoveredsize = recoveryBuffer.size(); + // again we use explicit pointers to make it easier for non-STL people + input = queryCompPost[i].second->data(); + inputsize = queryCompPost[i].second->size(); + ///////////////////////// + // BEGIN performance-sensitive section + // Note that input might not be in cache, and that + // output might be to RAM + //////////////////////// + z.reset(); + scheme.decodeArray(input,inputsize,recoverybuffer , recoveredsize); + unpackTime += static_cast(z.split()); + ///////////////////////// + // END performance-sensitive section + ///////////////////////// + assert(recoveredsize<=recoveryBuffer.size()); + unpackVolume += recoveredsize; + ///////////////////////// + // BEGIN performance-sensitive section for intersections. + // Both inputs could be in RAM, not in cache. + //////////////////////// + z.reset(); + intersectioncardinality = Inter(intersectionbuffer,intersectioncardinality, + recoverybuffer, recoveredsize,intersectionbuffer); + interTime += static_cast(z.split()); + //////////////////////////////// + // END performance-sensitive section for intersections. + /////////////////////////////// + } + SR.endQuery(intersectioncardinality, uncompPosts,qids) ; + } + CoarseUnpackInterTime += static_cast(coarsez.split()); + } + + + + + /** + * This is a version of "tests" where compressed posting lists are divided it up in NumberOfPartitions partitions. 
+ * If NumberOfPartitions is large enough, uncompressed posting lists will reside in cache, not in RAM. + */ + void splittedtest(int NumberOfPartitions, + BudgetedPostingCollector& uncompPosts, + const vector>& allPostIds) { + // we index on posting ID, and point to a vector of compressed postings + unordered_map > > compPostings; + unordered_map > uncompsizes; + + pair < uint32_t, uint32_t > range = uncompPosts.findDocumentIDRange( + allPostIds); + vector < uint32_t > bounds(NumberOfPartitions + 1); + for (int part = 0; part < NumberOfPartitions; ++part) + bounds[part] = range.first + part * (range.second - range.first) + / NumberOfPartitions;// slightly uneven + bounds[NumberOfPartitions] = range.second + 1; + vector < uint32_t > maxSize(NumberOfPartitions);// will contain the max size of a partition of a posting list + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + vector& onePost = uncompPosts.getOnePost(id); + vector::iterator i = onePost.begin(); + for(int part = 0; part < NumberOfPartitions; ++part) { + vector::iterator j = lower_bound(i,onePost.end(),bounds[part+1]); + uint32_t thissize = static_cast(j - i); + if(thissize > maxSize[part]) maxSize[part] = thissize; + i = j; + } + } + } + uint32_t AbsoluteMaxSize = *max_element(maxSize.begin(),maxSize.end()); + WallClockTimer z;// this is use only to time what we care + WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about + // 1. Benchmark only compression + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + if(compPostings.find(id) != compPostings.end()) // these hits could potentially slow us down a bit + continue; // was compressed earlier + vector& onePost = uncompPosts.getOnePost(id); + ////////////// + // BEGIN performance-sensitive section for *compression* + // (we don't care that much about it). 
+ ///////////// + z.reset(); + + vector::iterator i = onePost.begin(); + vector > subposts(NumberOfPartitions); + vector subsizes(NumberOfPartitions); + uint32_t sanitycheck = 0; + for(int part = 0; (i != onePost.end()) && (part < NumberOfPartitions); ++part) { + vector compressedBuffer; + vector::iterator j = lower_bound(i,onePost.end(),bounds[part+1]); + uint32_t thissize = static_cast(j - i); + if(j!= onePost.end()) assert(*j>=bounds[part+1]); + assert(*i >=bounds[part]); + subsizes[part] = thissize; + sanitycheck += thissize; + if(thissize != 0) { + compressedBuffer.resize(thissize+128); + size_t nvalue = compressedBuffer.size(); + vector dirtyCopy(i,j); // we make a copy because (1) some schemes modify input (2) we need 128-bit alignment + scheme.encodeArray(dirtyCopy.data(), thissize, + compressedBuffer.data(), nvalue); + compressedBuffer.resize(nvalue); + } + subposts[part] = compressedBuffer; + i = j; + + } + assert(i == onePost.end()); + assert(sanitycheck == onePost.size()); + compPostings.emplace(id,subposts); + uncompsizes.emplace(id,subsizes); + packTime += static_cast (z.split()); + } + } + CoarsePackTime += static_cast (coarsez.split()); + vector recoveryBuffer(AbsoluteMaxSize); + if(MaxRecoveryBuffer < AbsoluteMaxSize) MaxRecoveryBuffer = AbsoluteMaxSize; + vector < vector > intersection_result; + for(uint32_t size : maxSize) { + vector intersectionbuffer(size+128); + intersection_result.emplace_back(intersectionbuffer); + } + coarsez.reset(); + // 2. 
Test full cycle (decompression + intersection) + for (const vector & qids: allPostIds) { + SR.prepareQuery(); + size_t totalintercardinality = 0; + for(int part = 0; part < NumberOfPartitions; ++part) { + vector * > > queryCompPost; + for (uint32_t id: qids) { + uint32_t myuncompsize = uncompsizes[id][part]; + const vector * compressedposting = & compPostings[id][part]; + queryCompPost.emplace_back(make_pair(myuncompsize, compressedposting)); + } + sort(queryCompPost.begin(), queryCompPost.end()); + const uint32_t * input = queryCompPost.front().second->data(); + size_t inputsize = queryCompPost.front().second->size(); + uint32_t * const intersectionbuffer = intersection_result[part].data(); + z.reset(); + size_t intersectioncardinality = intersection_result[part].size(); + if(inputsize == 0) + intersectioncardinality = 0; + else + scheme.decodeArray(input,inputsize , + intersectionbuffer, intersectioncardinality); + unpackTime += static_cast(z.split()); + assert(intersectioncardinality<=intersection_result[part].size()); + unpackVolume += intersectioncardinality; + uint32_t * const recoverybuffer = recoveryBuffer.data(); + for (size_t i = 1; (intersectioncardinality>0) && (i < queryCompPost.size()); ++i) { + // again we use explicit pointers to make it easier for non-STL people + size_t recoveredsize = recoveryBuffer.size(); + input = queryCompPost[i].second->data(); + inputsize = queryCompPost[i].second->size(); + ///////////////////////// + // BEGIN performance-sensitive section + // Note that input might not be in cache, and that + // output might be to RAM + //////////////////////// + z.reset(); + scheme.decodeArray(input,inputsize,recoverybuffer , recoveredsize); + unpackTime += static_cast(z.split()); + assert(recoveredsize<=recoveryBuffer.size()); + ///////////////////////// + // END performance-sensitive section + ///////////////////////// + unpackVolume += recoveredsize; + ///////////////////////// + // BEGIN performance-sensitive section for 
intersections. + // Both inputs could be in RAM, not in cache. + //////////////////////// + z.reset(); + intersectioncardinality = Inter(intersectionbuffer,intersectioncardinality, + recoverybuffer, recoveredsize,intersectionbuffer); + interTime += static_cast(z.split()); + //////////////////////////////// + // END performance-sensitive section for intersections. + /////////////////////////////// + } + totalintercardinality += intersectioncardinality; + } + SR.endQuery(totalintercardinality, uncompPosts,qids); + } + CoarseUnpackInterTime += static_cast (coarsez.split()); + } + + void skippingtest(int SkipLog, BudgetedPostingCollector& uncompPosts, + const vector>& allPostIds) { + unordered_map > compPostings; // we use a hash table to store the postings, ought to fit in RAM + size_t MaxPostingSize(0); + + WallClockTimer z;// this is use only to time what we care + WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about + // 1. Benchmark only compression + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + if(compPostings.find(id) != compPostings.end()) // these hits could potentially slow us down a bit + continue; // was compressed earlier + vector& onePost = uncompPosts.getOnePost(id); + z.reset(); + compPostings.emplace(id , shared_ptr< Skipping>(new Skipping(SkipLog,onePost.data(),onePost.size()))); + packTime += static_cast (z.split()); + size_t qty = onePost.size(); + if (MaxPostingSize < qty) { + MaxPostingSize = qty; + } + packVolume += qty; + CompressedSizeDuringPacking += (compPostings[id]->storageInBytes()+sizeof(uint32_t)-1)/sizeof(uint32_t); + } + } + CoarsePackTime += static_cast (coarsez.split()); + vector < uint32_t > recoveryBuffer(MaxPostingSize + 1024); + if (MaxRecoveryBuffer < recoveryBuffer.size()) + MaxRecoveryBuffer = recoveryBuffer.size(); + vector < uint32_t > intersection_result(recoveryBuffer.size()); + coarsez.reset(); + for(vector qids : allPostIds) { + if(qids.empty()) continue;// 
odd but could happen? + SR.prepareQuery(); + vector > sizeids; + for(uint32_t i : qids) { + sizeids.emplace_back(make_pair(compPostings[i]->Length,i)); + } + sort(sizeids.begin(), sizeids.end()); + size_t intersize; + if(sizeids.size() == 1) { + intersize = compPostings[sizeids.front().second]->decompress(intersection_result.data()); + unpackVolume += intersize; + } else { + assert(compPostings.size()>=2); + intersize = compPostings[sizeids[0].second]->intersect(*compPostings[sizeids[1].second],intersection_result.data()); + unpackVolume += compPostings[sizeids[0].second]->Length; + unpackVolume += compPostings[sizeids[1].second]->Length; + for(size_t k = 2; (intersize>0) && (k < sizeids.size()); ++k) { + unpackVolume += compPostings[sizeids[k].second]->Length; + intersize = compPostings[sizeids[k].second]->intersect(intersection_result.data(),intersize,intersection_result.data()); + } + } + SR.endQuery(intersize, uncompPosts,qids); + } + CoarseUnpackInterTime += static_cast (coarsez.split()); + } + + void bitmaptest(uint32_t th, BudgetedPostingCollector& uncompPosts, + const vector>& allPostIds) { + uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; + size_t MaxPostingSize(0); + HybM2 hybrid(scheme, Inter, MaxId, th); + WallClockTimer z;// this is use only to time what we care + WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about + // 1. 
Benchmark only compression + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + if(hybrid.hasBeenLoaded(id)) continue; + vector& onePost = uncompPosts.getOnePost(id); + z.reset(); + CompressedSizeDuringPacking += hybrid.load(id, onePost.data(), onePost.size()); + packTime += static_cast (z.split()); + size_t qty = onePost.size(); + packVolume += qty; + if (MaxPostingSize < qty) { + MaxPostingSize = qty; + } + } + } + CoarsePackTime += static_cast (coarsez.split()); + vector < uint32_t > intersection_result(MaxPostingSize + 1024); + if (MaxRecoveryBuffer < hybrid.sizeOfRecoveryBufferInWords()) + MaxRecoveryBuffer = hybrid.sizeOfRecoveryBufferInWords(); + // testing round + for(vector qids : allPostIds) { + size_t sizeout = intersection_result.size(); + hybrid.intersect(qids, intersection_result.data(), sizeout); + vector trueintersection = uncompPosts.computeIntersection(qids,onesidedgallopingintersection); + if(trueintersection.size() != sizeout) throw runtime_error("not even same cardinality"); + for(uint32_t k = 0; k < sizeout;++k) + if(trueintersection[k]!=intersection_result[k]) throw runtime_error("intersection bug"); + } + + coarsez.reset(); + for(vector qids : allPostIds) { + SR.prepareQuery(); + size_t sizeout = intersection_result.size(); + unpackVolume += hybrid.intersect(qids, intersection_result.data(), sizeout); + SR.endQuery(sizeout, uncompPosts,qids); + } + CoarseUnpackInterTime += static_cast (coarsez.split()); + } + + + + void bitmapskippingtest(uint32_t BS, uint32_t th, BudgetedPostingCollector& uncompPosts, + const vector>& allPostIds) { + uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; + size_t MaxPostingSize(0); + SkippingHybM2 hybrid(MaxId, th, BS); + WallClockTimer z;// this is use only to time what we care + WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about + // 1. 
Benchmark only compression + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + if(hybrid.hasBeenLoaded(id)) continue; + vector& onePost = uncompPosts.getOnePost(id); + z.reset(); + CompressedSizeDuringPacking += hybrid.load(id, onePost.data(), onePost.size()); + packTime += static_cast (z.split()); + size_t qty = onePost.size(); + packVolume += qty; + if (MaxPostingSize < qty) { + MaxPostingSize = qty; + } + } + } + CoarsePackTime += static_cast (coarsez.split()); + vector < uint32_t > intersection_result(MaxPostingSize + 1024); + // testing round + for(vector qids : allPostIds) { + size_t sizeout = intersection_result.size(); + hybrid.intersect(qids, intersection_result.data(), sizeout); + vector trueintersection = uncompPosts.computeIntersection(qids,onesidedgallopingintersection); + if(trueintersection.size() != sizeout) throw runtime_error("not even same cardinality"); + for(uint32_t k = 0; k < sizeout;++k) + if(trueintersection[k]!=intersection_result[k]) throw runtime_error("intersection bug"); + } + + coarsez.reset(); + for(vector qids : allPostIds) { + SR.prepareQuery(); + size_t sizeout = intersection_result.size(); + unpackVolume += hybrid.intersect(qids, intersection_result.data(), sizeout); + SR.endQuery(sizeout, uncompPosts,qids); + } + CoarseUnpackInterTime += static_cast (coarsez.split()); + } + + + + void uncompressedbitmaptest(uint32_t th, BudgetedPostingCollector& uncompPosts, + const vector>& allPostIds) { + uint32_t MaxId = uncompPosts.findDocumentIDRange(allPostIds).second; + size_t MaxPostingSize(0); + UncompressedHybM2 hybrid( Inter, MaxId, th); + WallClockTimer z;// this is use only to time what we care + WallClockTimer coarsez;// for "coarse" timings, it includes many things we don't care about + // 1. 
Benchmark only compression + for (const vector & qids: allPostIds) { + for (uint32_t id: qids) { + if(hybrid.hasBeenLoaded(id)) continue; + vector& onePost = uncompPosts.getOnePost(id); + z.reset(); + CompressedSizeDuringPacking += hybrid.load(id, onePost.data(), onePost.size()); + packTime += static_cast (z.split()); + size_t qty = onePost.size(); + packVolume += qty; + if (MaxPostingSize < qty) { + MaxPostingSize = qty; + } + } + } + CoarsePackTime += static_cast (coarsez.split()); + vector < uint32_t > intersection_result(MaxPostingSize + 1024); + // testing round + for(vector qids : allPostIds) { + size_t sizeout = intersection_result.size(); + hybrid.intersect(qids, intersection_result.data(), sizeout); + vector trueintersection = uncompPosts.computeIntersection(qids,onesidedgallopingintersection); + if(trueintersection.size() != sizeout) throw runtime_error("not even same cardinality"); + for(uint32_t k = 0; k < sizeout;++k) + if(trueintersection[k]!=intersection_result[k]) throw runtime_error("intersection bug"); + } + + coarsez.reset(); + for(vector qids : allPostIds) { + SR.prepareQuery(); + size_t sizeout = intersection_result.size(); + unpackVolume += hybrid.intersect(qids, intersection_result.data(), sizeout); + SR.endQuery(sizeout, uncompPosts,qids); + } + CoarseUnpackInterTime += static_cast (coarsez.split()); + } + + + + /** + * This runs a a posting-list intersection tests without compression. + */ + void testUncompressed(BudgetedPostingCollector& uncompPosts, + const vector>& allPostIds) { + size_t MaxPostingSize = uncompPosts.findMaxPostingSize(allPostIds); + vector < uint32_t > inter(MaxPostingSize); + WallClockTimer coarsez; + for(vector qids : allPostIds) { + SR.prepareQuery(); + if(qids.empty()) continue;// odd but could happen? 
+ vector > sizeids; + for(uint32_t i : qids) { + sizeids.emplace_back(make_pair(uncompPosts.getOnePost(i).size(),i)); + } + sort(sizeids.begin(), sizeids.end()); + vector * answer = & uncompPosts.getOnePost(sizeids.front().second); + size_t intersize = answer->size(); + unpackVolume += intersize; + for(size_t k = 1; (intersize>0) && (k < sizeids.size()); ++k) { + vector & nextone = uncompPosts.getOnePost(sizeids[k].second); + unpackVolume += nextone.size(); + intersize = Inter(answer->data(), intersize, + nextone.data(), nextone.size(), inter.data()); + answer = & inter; + } + SR.endQuery(intersize, uncompPosts,qids); + } + CoarseUnpackInterTime += static_cast (coarsez.split()); + } + + + void printNumbers(bool detailed) const { + cout< (unpackVolume) + / (CoarseUnpackInterTime) ; + cout << setw(10) << setprecision(4) + << SR.averageTimeInMS(); + cout << setw(10) << setprecision(4) + << SR.medianTimeInMS(); + cout << setw(10) << setprecision(4) + << SR.ninetypercentileTimeInMS()<< endl; + + return; + } + cout<<"# max recovery buffer = "<(MaxRecoveryBuffer*sizeof(uint32_t))/(1024.0*1024.0))<<"MB"<0) { + cout<<"# compression: number of integers + bits/int + speed (mis) + coarse speed (mis) " << endl; + cout<< setw(20)<(CompressedSizeDuringPacking * sizeof(uint32_t) ) * 8.0 / static_cast(packVolume) + << setw(20) << setprecision(4) << static_cast(packVolume) / packTime + << setw(20)<< setprecision(4) << static_cast(packVolume) / CoarsePackTime + << endl; + }; + if (unpackVolume>0) { + cout + << "# decompression: number of integers + "; + if(unpackTime>0) + cout<<" decompression speed (mis) +"; + if(interTime>0) + cout<<" intersection speed (mis) +"; + if((unpackTime+interTime)>0) + cout<<" total speed (mis) + "; + cout << "coarse total speed (mis) +"; + cout << "avg (ms/query) + median (ms/query) + 90perc (ms/query)" + << endl; + cout << setw(10) << unpackVolume; + if (unpackTime > 0) + cout << setw(10) << setprecision(4) + << static_cast (unpackVolume) / unpackTime; 
+ if (interTime > 0) + cout << setw(10) << setprecision(4) + << static_cast (unpackVolume) / interTime; + if ((unpackTime + interTime) > 0) + cout << setw(10) << setprecision(4) + << static_cast (unpackVolume) / (unpackTime + + interTime); + cout << setw(10) << setprecision(4) + << static_cast (unpackVolume) + / (CoarseUnpackInterTime); + cout << setw(10) << setprecision(4) + << SR.averageTimeInMS(); + cout << setw(10) << setprecision(4) + << SR.medianTimeInMS(); + cout << setw(10) << setprecision(4) + << SR.ninetypercentileTimeInMS()<< endl; + }; + + cout<::max(); + + bool useCompression = false; + int SkipLog = 0; + int th = -1; + + string scheme = "simdfastpforr"; + int partitions = 1; + bool dumpcompletestats = false; + + size_t memBudget = 1024ULL * 1024 * 1024 * 4; // 4GB seems like a better default than 16GB + + int c; + while ((c = getopt(argc, argv, "i:os:b:hl:p:qk:B:d")) != -1) { + switch (c) { + case 'd': + dumpcompletestats = true; + break; + case 'i': + InterName = optarg; + if( ! IntersectionFactory::valid(InterName )) { + cerr << "I don't recognize the intersection scheme '"< 31)) { + cerr<<"Skip param needs to be within [1,31]."<= argc) { + printusage(argv[0]); + return -1; + } + if(!CODECFactory::valid(scheme)) { + cout<<"Compression scheme "< 0) && (useCompression)) { + cout<<"conflicting options. You cannot mix skipping and compression."<(memBudget) / 1024.0 / 1024.0 / 1024.0 << "GB" << endl; + cout << "# Maximum number of queries: " << maxLinesToProcess << endl; + if(SkipLog != 0) { + cout<<"# testing skipping with block size: "<<(1<=0) { + cout<<"# bitmap threshold: "<=0) { + if(th == 0) + cout<<"# bitmap threshold: automatic (0)"<1) + cout<<"# number of partitions: "<=0) + cout<<"# bitmap threshold: "<> allPostIds; // this a buffer where queries are stored. 
+ + size_t skippedQty = 0, lineQty = 0, randPickQty = 0; + + bool detailedDisplay = useCompression || (SkipLog!=0) || (th>=0); + + + vector oneQueryPostIds; // buffer where a single query is stored + for (; (static_cast(lineQty - skippedQty) < maxLinesToProcess ) && logFile && getline(logFile, line); ++lineQty) { + stringstream lineStr(line); + oneQueryPostIds.clear(); + + { + uint32_t id; + while (lineStr >> id) { + if(uncompPosts.valid(id)) oneQueryPostIds.emplace_back(id); + } + } + if( oneQueryPostIds.empty() || + ( (! bIncludeOnePostQuery) && (oneQueryPostIds.size() == 1) ) + ) { + skippedQty++; + continue; + } + if (!uncompPosts.loadPostings(oneQueryPostIds)) {// we couldn't load them all in + if (SkipLog!=0) { + if(th>=0) { + testBed.bitmapskippingtest(SkipLog,th,uncompPosts, allPostIds); + } else { + testBed.skippingtest(SkipLog,uncompPosts, allPostIds); + } + } else if(useCompression) { + if(th>=0) + testBed.bitmaptest(th,uncompPosts, allPostIds); + else if(partitions > 1) + testBed.splittedtest(partitions,uncompPosts, allPostIds); + else + testBed.test(uncompPosts, allPostIds); + } else { + if(th>=0) + testBed.uncompressedbitmaptest(th,uncompPosts, allPostIds); + else + testBed.testUncompressed(uncompPosts, allPostIds); + } + if(! 
quiet) cout<<"# queries processed so far: "<<(lineQty - skippedQty) <=0) { + testBed.bitmapskippingtest(SkipLog,th,uncompPosts, allPostIds); + } else { + testBed.skippingtest(SkipLog,uncompPosts, allPostIds); + } + } else if(useCompression) { + if(th>=0) + testBed.bitmaptest(th,uncompPosts, allPostIds); + else if(partitions > 1) + testBed.splittedtest(partitions,uncompPosts, allPostIds); + else + testBed.test(uncompPosts, allPostIds); + } else { + if(th>=0) + testBed.uncompressedbitmaptest(th,uncompPosts, allPostIds); + else + testBed.testUncompressed(uncompPosts, allPostIds); + } + testBed.printNumbers(detailedDisplay); + + cout << "# Total lines: " << lineQty << + " processed: " << (lineQty - skippedQty) << + " skipped: " << skippedQty << " assigned randomly: " << randPickQty < + +#include "common.h" +#include "util.h" +#include "timer.h" +#include "maropuparser.h" +#include "codecfactory.h" + + +void printusage() { + cout << " Try ./compress -s nameofscheme input.bin output.bin" << endl; + +} + +int main(int argc, char **argv) { + string scheme; + int c; + while ((c = getopt(argc, argv, "s:h")) != -1) + switch (c) { + case 's': + scheme = optarg; + break; + case 'h': + printusage(); + return 0; + default: + abort(); + } + if(optind + 1 >= argc) { + printusage(); + return -1; + } + + try { + string ifilename = argv[optind]; + string ofilename = argv[optind + 1]; + + shared_ptr schemeptr = CODECFactory::getFromName(scheme); + if(schemeptr.get() == NULL) return -1; + + + MaropuGapReader reader (ifilename); + if (!reader.open()) { + cout << " could not open " << ifilename << " for reading..." << endl; + return -1; + } + FILE * fd = ::fopen(ofilename.c_str(), "wb"); + if (fd == NULL) { + cout << " could not open " << ofilename << " for writing..." 
<< endl; + return -1; + } + cout<<"Compressing content from "<< ifilename << " to " << ofilename << " using " << scheme << endl; + vector buffer; + + // write a format version number + uint32_t VERSION = 1; + if (fwrite(&VERSION, sizeof(VERSION), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + reader.close(); + return -1; + } + // store the scheme identifier + uint32_t schemesize = static_cast(scheme.size() * sizeof(char)); + if (fwrite(&schemesize, sizeof(schemesize), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + reader.close(); + return -1; + } + + if (fwrite(scheme.c_str(), scheme.size() * sizeof(char), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + reader.close(); + return -1; + } + vector obuffer(1024); + size_t numberofarrays = 0; + size_t volume = 0; + size_t volumeout = 0; + WallClockTimer z; + while(reader.loadIntegers(buffer)) { + if(obuffer.size() < buffer.size()) { + obuffer.resize(buffer.size() + 1024); + } + size_t outsize = obuffer.size(); + schemeptr->encodeArray(buffer.data(), buffer.size(), obuffer.data(),outsize); + uint32_t osize = static_cast(outsize); + if (fwrite(&osize, sizeof(uint32_t), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + return -1; + } + uint32_t insize = static_cast(buffer.size()); + if (fwrite(&insize, sizeof(uint32_t), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + return -1; + } + if (fwrite(obuffer.data(), sizeof(uint32_t) * outsize, 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + return -1; + } + volumeout += sizeof(osize) + sizeof(insize) + sizeof(uint32_t) * outsize; + volume += buffer.size(); + if(numberofarrays % 1000 == 0 ) { + cout <<"."; + cout.flush(); + } + ++numberofarrays; + } + cout< 0) + cout<<"Bits per int : "<< (8.0 * static_cast(volumeout) )/static_cast(volume)< 0 ) { + double speed = static_cast(volume) /static_cast(ti); + cout<<" Speed: "< +#include + +#include "common.h" +#include "maropuparser.h" +#include 
"util.h" +#include "delta.h" +#include "budgetedpostingcollector.h" + +void message(const char * prog) { + cerr << " usage : " << prog << " maropubinaryfile querylogfile" + << endl; +} + +class EntropyRecorder { +public: + EntropyRecorder() : + counter(), totallength(0) { + } + + void clear() { + counter.clear(); + totallength = 0; + } + void eat(const uint32_t * in, const size_t length) { + if (length == 0) + return; + totallength += length; + for (uint32_t k = 0; k < length; ++k, ++in) { + maptype::iterator i = counter.find(*in); + if (i != counter.end()) + i->second += 1; + else + counter[*in] = 1; + } + } + + double computeShannon() { + double total = 0; + for (maptype::iterator i = counter.begin(); i + != counter.end(); ++i) { + const double x = static_cast(i->second); + total += x / static_cast(totallength) * log(static_cast(totallength) / x) / log(2.0); + } + return total; + } + + __attribute__ ((pure)) + double computeDataBits() { + double total = 0; + for (maptype::const_iterator i = counter.begin(); i + != counter.end(); ++i) { + total += static_cast(i->second) / static_cast(totallength) * static_cast(gccbits(i->first)); + } + return total; + } + typedef unordered_map maptype; + maptype counter; + size_t totallength; +}; + + +int main(int argc, char **argv) { + size_t maxLinesToProcess = numeric_limits::max(); + + size_t memBudget = 1024ULL * 1024 * 1024 * 2; // 2GB ought to be enough + bool bIncludeOnePostQuery = false; + + int c; + while ((c = getopt(argc, argv, "ob:l:h")) != -1) { + switch (c) { + case 'o': + bIncludeOnePostQuery = true; + break; + case 'b': + memBudget = 1024ULL * 1024 * 1024 * atol(optarg); + break; + case 'l': + maxLinesToProcess = atol(optarg); + break; + case 'h': + message(argv[0]); + return 0; + default: + message(argv[0]); + return -1; + } + } + + if(optind + 1 >= argc) { + message(argv[0]); + return -1; + } + cout << "# Memory budget for uncompressed postings: " << setprecision(2) + << static_cast(memBudget) / 1024.0 / 1024.0 
/ 1024.0 << "GB" << endl; + cout << "# Maximum number of queries: " << maxLinesToProcess << endl; + cout << "# Do we include one-word queries: " << (bIncludeOnePostQuery ? "yes":"no")<< endl; + string postFileName = argv[optind]; + string logFileName = argv[optind + 1]; + ifstream logFile(logFileName.c_str()); + if(!logFile.is_open()) { + cerr<<" Couldn't open query log file "< oneQueryPostIds; // buffer where a single query is stored + + for (; (static_cast(lineQty - skippedQty) < maxLinesToProcess ) && logFile && getline(logFile, line); ++lineQty) { + stringstream lineStr(line); + + oneQueryPostIds.clear(); + + { + uint32_t id; + while (lineStr >> id) { + if(uncompPosts.valid(id)) oneQueryPostIds.emplace_back(id); + } + } + if( oneQueryPostIds.empty() || + ( (! bIncludeOnePostQuery) && (oneQueryPostIds.size() == 1) ) + ) { + skippedQty++; + continue; + } + if (!uncompPosts.loadPostings(oneQueryPostIds)) {// we couldn't load them all in + uncompPosts.clear(); + assert(uncompPosts.getMemUsed() == 0); + if (!uncompPosts.loadPostings(oneQueryPostIds)) { + cerr << "Cannot load postings for the query '" << line + << "' after all postings are deleted. Perhaps, the memory budget is too small." 
<< endl; + cerr << "Aborting!"< buffer = uncompPosts.getOnePost(id); + delta(0U,buffer.data(),buffer.size()); + er.eat(buffer.data(),buffer.size()); + } + + + } + logFile.close();// we are done + + + + + cout << "# next line is shannon entropy and data bits" << endl; + cout << er.computeShannon() << "\t" << er.computeDataBits() < +#include +#include + +#include "common.h" +#include "util.h" +#include "timer.h" +#include "maropuparser.h" +#include "codecfactory.h" + + +void printusage(char *prog) { + cout << "Usage: " << prog << " " << endl; + +} + +int main(int argc, char **argv) { + if(argc != 2) { + printusage(argv[1]); + return -1; + } + + + try { + string postFileName = argv[1]; + + MaropuGapReader reader (postFileName); + if (!reader.open()) { + cout << " could not open " << postFileName << " for reading..." << endl; + return -1; + } + + off_t pos; + uint32_t qty; + uint32_t postId = 0; + + while (reader.readNextPosAndQty(pos, qty)) { + cout << "id: " << postId << " qty: " << qty << "offset: " << pos << endl; + postId++; + if ((reader.getPos() - pos) != (qty + 1) * 4) { + cerr << "Internal error: unpexected diff in offsets!" << endl; + return -1; + } + } + + } catch (const exception& e) { + cerr << "Run-time error: " << e.what() << endl; + return -1; + } catch (...) { + cerr << "Unknown exception caught, terminating..." << endl; + return -1; + } + + return 0; +} + + diff --git a/advancedbenchmarking/src/simplesynth.cpp b/advancedbenchmarking/src/simplesynth.cpp new file mode 100644 index 0000000..51bcaa5 --- /dev/null +++ b/advancedbenchmarking/src/simplesynth.cpp @@ -0,0 +1,115 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ +# + +#include + +#include "common.h" +#include "util.h" +#include "timer.h" +#include "synthetic.h" + + +void printusage() { + cout << "This generate a file containing arrays. 
Each array is " + "written as a sequence of 32-bit unsigned integers (sorted) " + "preceded by a length indicated as a 32-bit unsigned integer. " + <= 32)) { + printusage(); + return -1; + } + break; + default: + abort(); + } + if(optind >= argc) { + printusage(); + return -1; + } + uint32_t Max = 1U<Max) { + printusage(); + return -1; + } + string ofilename = argv[optind]; + FILE * fd = ::fopen(ofilename.c_str(), "wb"); + if (fd == NULL) { + cout << " could not open " << ofilename << " for writing..." << endl; + return -1; + } + cout <<" generating "<< (cluster ? "cluster" : "uniform") << + " "<< howmany<< " arrays of length = "<< N << + " over "<< range<< " bits to file "<< ofilename < array = cluster ? cdg.generate(N,Max) : udg.generate(N,Max); + assert(array.size() == N); + if (fwrite(&N, sizeof(N), 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + return -1; + } + if (fwrite(array.data(), sizeof(uint32_t) * N, 1, fd) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + return -1; + } + if(i % 1000 == 0 ) { + cout <<"."; + cout.flush(); + } + } + cout<0) { + double speed = static_cast(howmany * N) /static_cast(ti); + cout<<" Speed: "< + +#include "common.h" +#include "util.h" +#include "timer.h" +#include "maropuparser.h" +#include "codecfactory.h" + + +void printusage() { + cout << " Try ./uncompress input.bin output.bin" << endl; + +} + +int main(int argc, char **argv) { + string scheme; + int c; + while ((c = getopt(argc, argv, "h")) != -1) + switch (c) { + case 'h': + printusage(); + return 0; + default: + abort(); + } + if(optind + 1 >= argc) { + printusage(); + return -1; + } + try { + string ifilename = argv[optind]; + string ofilename = argv[optind + 1]; + FILE * fd = ::fopen(ifilename.c_str(), "rb"); + if(fd == NULL) { + cerr <<" can't open "< b(schemesize+1); + result = fread(b.data(), schemesize*sizeof(char), 1, fd); + if (result != 1) { + ::fclose(fd); + ::fclose(fdout); + return -1; + } + string schemename (b.data()); + cout<<" data 
was compressed using "< schemeptr = CODECFactory::getFromName(schemename); + if(schemeptr.get() == NULL) { + return -1; + } + vector buffer, obuffer; + size_t i = 0; + size_t volume = 0; + WallClockTimer z; + while(true) { + uint32_t csize; + result = fread(&csize, sizeof(uint32_t), 1, fd); + if (result != 1) { + break; + } + uint32_t osize; + result = fread(&osize, sizeof(uint32_t), 1, fd); + if (result != 1) { + break; + } + obuffer.resize(osize); + buffer.resize(csize); + result = fread(buffer.data(), sizeof(uint32_t) * csize, 1, fd); + if (result != 1) { + break; + } + size_t sosize (osize); + schemeptr->decodeArray(buffer.data(), buffer.size(), obuffer.data(),sosize); + assert(sosize == osize); + if (fwrite(&osize, sizeof(osize), 1, fdout) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + ::fclose(fdout); + return -1; + } + if (fwrite(obuffer.data(), sizeof(uint32_t) * osize, 1, fdout) != 1) { + cerr << "aborting" << endl; + ::fclose(fd); + ::fclose(fdout); + return -1; + } + + if(i % 1000 == 0 ) { + cout <<"."; + cout.flush(); + } + volume += osize; + ++i; + + } + cout<0) { + double speed = static_cast(volume) /static_cast(ti); + cout<<" Speed: "< mydata(N); + for(uint32_t i = 0; i < N;++i) mydata[i] = 3*i; + /////////// + // + // You need some "output" container. You are responsible + // for allocating enough memory. + // + vector compressed_output(N+1024); + // N+1024 should be plenty + // + // + size_t compressedsize = compressed_output.size(); + codec.encodeArray(mydata.data(), mydata.size(), + compressed_output.data(), compressedsize); + // + // if desired, shrink back the array: + compressed_output.resize(compressedsize); + compressed_output.shrink_to_fit(); + // display compression rate: + cout<(compressed_output.size()) / + static_cast(mydata.size()) <<" bits per integer. 
"< mydataback(N); + size_t recoveredsize = mydataback.size(); + // + codec.decodeArray(compressed_output.data(), + compressed_output.size(), mydataback.data(), recoveredsize); + mydataback.resize(recoveredsize); + // + // That's it for compression! + // + if(mydataback != mydata) throw runtime_error("bug!"); + + // + // Next we are going to test out intersection... + // + vector mydata2(N); + for(uint32_t i = 0; i < N;++i) mydata2[i] = 6*i; + intersectionfunction inter = IntersectionFactory::getFromName("simd");// using SIMD intersection + // + // we are going to intersect mydata and mydata2 and write back + // the result to mydata2 + // + size_t intersize = inter(mydata2.data(),mydata2.size(),mydata.data(),mydata.size(),mydata2.data()); + mydata2.resize(intersize); + mydata2.shrink_to_fit(); + cout<<"Intersection size: " << mydata2.size() <<" integers. "<(initoffset,out); + initoffset = *(out+BitPackingHelpers::BlockSize - 1); + } + + static uint32_t maxbits(const uint32_t * in, uint32_t & initoffset) { + uint32_t accumulator = in[0] - initoffset; + for(uint32_t k = 1; k < BitPackingHelpers::BlockSize; ++k) { + accumulator |= in[k] - in[k-1]; + } + initoffset = in [BitPackingHelpers::BlockSize-1]; + return gccbits(accumulator); + } + + static void inline packblockwithoutmask(uint32_t * in, uint32_t * out, const uint32_t bit, uint32_t & initoffset ) { + const uint32_t nextoffset = *(in+BitPackingHelpers::BlockSize - 1); + if(bit<32) delta(initoffset,in); + BitPackingHelpers::fastpackwithoutmask(in,out,bit); + initoffset = nextoffset; + } + static string name() { + return "BasicBlockPacker"; + } + +}; + +struct NoDeltaBlockPacker { + static void inline unpackblock(const uint32_t * in, uint32_t * out, const uint32_t bit, uint32_t & ) { + BitPackingHelpers::fastunpack(in,out,bit); + } + static void inline packblockwithoutmask(uint32_t * in, uint32_t * out, const uint32_t bit, uint32_t & ) { + BitPackingHelpers::fastpackwithoutmask(in,out,bit); + } + + static uint32_t 
maxbits(const uint32_t * in, uint32_t & ) { + uint32_t accumulator = 0; + for(uint32_t k = 0; k < BitPackingHelpers::BlockSize; ++k) { + accumulator |= in[k]; + } + return gccbits(accumulator); + } + + static string name() { + return "NoDeltaBlockPacker"; + } +}; + + + +struct IntegratedBlockPacker { + __attribute__ ((pure)) + static uint32_t maxbits(const uint32_t * in, uint32_t & initoffset) { + uint32_t accumulator = in[0] - initoffset; + for(uint32_t k = 1; k < BitPackingHelpers::BlockSize; ++k) { + accumulator |= in[k] - in[k-1]; + } + initoffset = in [BitPackingHelpers::BlockSize-1]; + return gccbits(accumulator); + } + + static void inline packblockwithoutmask(const uint32_t * in, uint32_t * out, const uint32_t bit, uint32_t & initoffset ) { + BitPackingHelpers::integratedfastpackwithoutmask(initoffset,in,out,bit); + initoffset = *(in+BitPackingHelpers::BlockSize - 1); + } + static void inline unpackblock(const uint32_t * in, uint32_t * out, const uint32_t bit, uint32_t & initoffset ) { + BitPackingHelpers::integratedfastunpack(initoffset,in,out,bit); + initoffset = *(out+BitPackingHelpers::BlockSize - 1); + } + static string name() { + return "IntegratedBlockPacker"; + } +}; + + + +template +class BinaryPacking: public IntegerCODEC { +public: + + + static const uint32_t MiniBlockSize = 32; + static const uint32_t HowManyMiniBlocks = 4; + static const uint32_t BlockSize = MiniBlockSize;//HowManyMiniBlocks * MiniBlockSize; + static const uint32_t bits32 = 8 ; + + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + checkifdivisibleby(length, BlockSize); + const uint32_t * const initout(out); + *out++ = static_cast(length); + uint32_t Bs[HowManyMiniBlocks]; + uint32_t init = 0; + const uint32_t * const final = in + length; + for (; in + HowManyMiniBlocks * MiniBlockSize + <= final; in += HowManyMiniBlocks * MiniBlockSize) { + uint32_t tmpinit = init; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + Bs[i] = 
BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); + } + *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) + | Bs[3]; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i],init); + out += Bs[i]; + } + } + if(in < final) { + size_t howmany = ( final - in ) /MiniBlockSize; + uint32_t tmpinit = init; + memset(&Bs[0],0,HowManyMiniBlocks*sizeof(uint32_t)); + for (uint32_t i = 0; i < howmany; ++i) { + Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize, tmpinit); + } + *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) + | Bs[3]; + for (uint32_t i = 0; i < howmany; ++i) { + BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i],init); + out += Bs[i]; + } + } + nvalue = out - initout; + } + + const uint32_t * decodeArray(const uint32_t *in, const size_t /*length*/, + uint32_t *out, size_t & nvalue) { + const uint32_t actuallength = *in++; + checkifdivisibleby(actuallength, BlockSize); + const uint32_t * const initout(out); + uint32_t Bs[HowManyMiniBlocks]; + uint32_t init = 0; + for (; out < initout + actuallength /(HowManyMiniBlocks * MiniBlockSize) *HowManyMiniBlocks * MiniBlockSize + ; out += HowManyMiniBlocks * MiniBlockSize) { + Bs[0] = static_cast(in[0] >> 24); + Bs[1] = static_cast(in[0] >> 16); + Bs[2] = static_cast(in[0] >> 8); + Bs[3] = static_cast(in[0]); + ++in; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i],init); + in += Bs[i]; + } + } + if(out < initout + actuallength) { + size_t howmany = ( initout + actuallength - out ) /MiniBlockSize; + Bs[0] = static_cast(in[0] >> 24); + Bs[1] = static_cast(in[0] >> 16); + Bs[2] = static_cast(in[0] >> 8); + Bs[3] = static_cast(in[0]); + ++in; + for (uint32_t i = 0; i < howmany; ++i) { + BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i],init); + in += Bs[i]; + } + out += howmany * MiniBlockSize; + + } + nvalue = out - initout; + return in; + } + + 
string name() const { + ostringstream convert; + convert << "BinaryPacking"<<"With"< + +void __fastunpack0(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack1(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack2(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack3(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack4(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack5(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack6(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack7(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack9(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack10(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack11(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack12(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack13(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack14(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack15(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack17(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack18(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack19(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack20(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack21(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack22(const uint32_t * __restrict__ 
in, uint32_t * __restrict__ out); +void __fastunpack23(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack25(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack26(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack27(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack28(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack29(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack30(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack31(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastunpack32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); + + +void __fastpack0(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack1(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack2(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack3(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack4(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack5(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack6(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack7(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack9(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack10(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack11(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack12(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack13(const uint32_t * 
__restrict__ in, uint32_t * __restrict__ out); +void __fastpack14(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack15(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack17(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack18(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack19(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack20(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack21(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack22(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack23(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack25(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack26(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack27(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack28(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack29(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack30(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack31(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpack32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); + +void __fastpackwithoutmask0(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask1(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask2(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask3(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); 
+void __fastpackwithoutmask4(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask5(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask6(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask7(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask9(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask10(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask11(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask12(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask13(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask14(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask15(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask17(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask18(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask19(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask20(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask21(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask22(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask23(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void 
__fastpackwithoutmask25(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask26(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask27(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask28(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask29(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask30(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask31(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __fastpackwithoutmask32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out); + +#endif // BITPACKING diff --git a/include/bitpackinghelpers.h b/include/bitpackinghelpers.h new file mode 100644 index 0000000..7d636d6 --- /dev/null +++ b/include/bitpackinghelpers.h @@ -0,0 +1,679 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Leonid Boytsov, Nathan Kurz and Daniel Lemire + */ + +#ifndef BITPACKINGHELPERS_H_ +#define BITPACKINGHELPERS_H_ + +#include "bitpacking.h" +#include "integratedbitpacking.h" +#include "delta.h" +#include "util.h" + +struct BitPackingHelpers { + const static unsigned BlockSize = 32; + + static void inline fastunpack(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch(bit) { + case 0: + __fastunpack0(in,out); + break; + case 1: + __fastunpack1(in,out); + break; + case 2: + __fastunpack2(in,out); + break; + case 3: + __fastunpack3(in,out); + break; + case 4: + __fastunpack4(in,out); + break; + case 5: + __fastunpack5(in,out); + break; + case 6: + __fastunpack6(in,out); + break; + case 7: + __fastunpack7(in,out); + break; + case 8: + __fastunpack8(in,out); + break; + case 9: + __fastunpack9(in,out); + break; + case 10: + __fastunpack10(in,out); + break; + case 11: + __fastunpack11(in,out); + break; + case 12: + __fastunpack12(in,out); + break; + case 13: + __fastunpack13(in,out); + break; + case 14: + __fastunpack14(in,out); + break; + case 15: + __fastunpack15(in,out); + break; + case 16: + __fastunpack16(in,out); + break; + case 17: + __fastunpack17(in,out); + break; + case 18: + __fastunpack18(in,out); + break; + case 19: + __fastunpack19(in,out); + break; + case 20: + __fastunpack20(in,out); + break; + case 21: + __fastunpack21(in,out); + break; + case 22: + __fastunpack22(in,out); + break; + case 23: + __fastunpack23(in,out); + break; + case 24: + __fastunpack24(in,out); + break; + case 25: + __fastunpack25(in,out); + break; + case 26: + __fastunpack26(in,out); + break; + case 27: + __fastunpack27(in,out); + break; + case 28: + __fastunpack28(in,out); + break; + case 29: + __fastunpack29(in,out); + break; + case 30: + __fastunpack30(in,out); + break; + case 31: + __fastunpack31(in,out); + break; + case 32: + __fastunpack32(in,out); + break; + default: + break; + } + } + + + + static void inline fastpack(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch(bit) { + case 0: + __fastpack0(in,out); + break; + case 1: + __fastpack1(in,out); + break; + case 2: + __fastpack2(in,out); + break; + case 3: + __fastpack3(in,out); + break; + case 4: + __fastpack4(in,out); + break; + case 5: + __fastpack5(in,out); + break; + case 6: + __fastpack6(in,out); + break; + case 7: + __fastpack7(in,out); + break; + case 8: + __fastpack8(in,out); + break; + case 9: + __fastpack9(in,out); + break; + case 10: + __fastpack10(in,out); + break; + case 11: + __fastpack11(in,out); + break; + case 12: + __fastpack12(in,out); + break; + case 13: + __fastpack13(in,out); + break; + case 14: + __fastpack14(in,out); + break; + case 15: + __fastpack15(in,out); + break; + case 16: + __fastpack16(in,out); + break; + case 17: + __fastpack17(in,out); + break; + case 18: + __fastpack18(in,out); + break; + case 19: + __fastpack19(in,out); + break; + case 20: + __fastpack20(in,out); + break; + case 21: + __fastpack21(in,out); + break; + case 22: + __fastpack22(in,out); + break; + case 23: + __fastpack23(in,out); + break; + case 24: + __fastpack24(in,out); + break; + case 25: + __fastpack25(in,out); + break; + case 26: + __fastpack26(in,out); + break; + case 27: + __fastpack27(in,out); + break; + case 28: + __fastpack28(in,out); + break; + case 29: + __fastpack29(in,out); + break; + case 30: + __fastpack30(in,out); + break; + case 31: + __fastpack31(in,out); + break; + case 32: + __fastpack32(in,out); + break; + default: + break; + } + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + static void inline fastpackwithoutmask(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch(bit) { + case 0: + __fastpackwithoutmask0(in,out); + break; + case 1: + __fastpackwithoutmask1(in,out); + break; + case 2: + __fastpackwithoutmask2(in,out); + break; + case 3: + __fastpackwithoutmask3(in,out); + break; + case 4: + __fastpackwithoutmask4(in,out); + break; + case 5: + __fastpackwithoutmask5(in,out); + break; + case 6: + __fastpackwithoutmask6(in,out); + break; + case 7: + __fastpackwithoutmask7(in,out); + break; + case 8: + __fastpackwithoutmask8(in,out); + break; + case 9: + __fastpackwithoutmask9(in,out); + break; + case 10: + __fastpackwithoutmask10(in,out); + break; + case 11: + __fastpackwithoutmask11(in,out); + break; + case 12: + __fastpackwithoutmask12(in,out); + break; + case 13: + __fastpackwithoutmask13(in,out); + break; + case 14: + __fastpackwithoutmask14(in,out); + break; + case 15: + __fastpackwithoutmask15(in,out); + break; + case 16: + __fastpackwithoutmask16(in,out); + break; + case 17: + __fastpackwithoutmask17(in,out); + break; + case 18: + __fastpackwithoutmask18(in,out); + break; + case 19: + __fastpackwithoutmask19(in,out); + break; + case 20: + __fastpackwithoutmask20(in,out); + break; + case 21: + __fastpackwithoutmask21(in,out); + break; + case 22: + __fastpackwithoutmask22(in,out); + break; + case 23: + __fastpackwithoutmask23(in,out); + break; + case 24: + __fastpackwithoutmask24(in,out); + break; + case 25: + __fastpackwithoutmask25(in,out); + break; + case 26: + __fastpackwithoutmask26(in,out); + break; + case 27: + __fastpackwithoutmask27(in,out); + break; + case 28: + __fastpackwithoutmask28(in,out); + break; + case 29: + __fastpackwithoutmask29(in,out); + break; + case 30: + __fastpackwithoutmask30(in,out); + break; + case 31: + __fastpackwithoutmask31(in,out); + break; + case 32: + __fastpackwithoutmask32(in,out); + break; + default: + break; + } + } + + + static void inline integratedfastunpack(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + 
// Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch(bit) { + case 0: + __integratedfastunpack0(initoffset,in,out); + break; + case 1: + __integratedfastunpack1(initoffset,in,out); + break; + case 2: + __integratedfastunpack2(initoffset,in,out); + break; + case 3: + __integratedfastunpack3(initoffset,in,out); + break; + case 4: + __integratedfastunpack4(initoffset,in,out); + break; + case 5: + __integratedfastunpack5(initoffset,in,out); + break; + case 6: + __integratedfastunpack6(initoffset,in,out); + break; + case 7: + __integratedfastunpack7(initoffset,in,out); + break; + case 8: + __integratedfastunpack8(initoffset,in,out); + break; + case 9: + __integratedfastunpack9(initoffset,in,out); + break; + case 10: + __integratedfastunpack10(initoffset,in,out); + break; + case 11: + __integratedfastunpack11(initoffset,in,out); + break; + case 12: + __integratedfastunpack12(initoffset,in,out); + break; + case 13: + __integratedfastunpack13(initoffset,in,out); + break; + case 14: + __integratedfastunpack14(initoffset,in,out); + break; + case 15: + __integratedfastunpack15(initoffset,in,out); + break; + case 16: + __integratedfastunpack16(initoffset,in,out); + break; + case 17: + __integratedfastunpack17(initoffset,in,out); + break; + case 18: + __integratedfastunpack18(initoffset,in,out); + break; + case 19: + __integratedfastunpack19(initoffset,in,out); + break; + case 20: + __integratedfastunpack20(initoffset,in,out); + break; + case 21: + __integratedfastunpack21(initoffset,in,out); + break; + case 22: + __integratedfastunpack22(initoffset,in,out); + break; + case 23: + __integratedfastunpack23(initoffset,in,out); + break; + case 24: + __integratedfastunpack24(initoffset,in,out); + break; + case 25: + __integratedfastunpack25(initoffset,in,out); + break; + case 26: + 
__integratedfastunpack26(initoffset,in,out); + break; + case 27: + __integratedfastunpack27(initoffset,in,out); + break; + case 28: + __integratedfastunpack28(initoffset,in,out); + break; + case 29: + __integratedfastunpack29(initoffset,in,out); + break; + case 30: + __integratedfastunpack30(initoffset,in,out); + break; + case 31: + __integratedfastunpack31(initoffset,in,out); + break; + case 32: + __integratedfastunpack32(initoffset,in,out); + break; + default: + break; + } + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + static void inline integratedfastpackwithoutmask(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch(bit) { + case 0: + __integratedfastpack0(initoffset,in,out); + break; + case 1: + __integratedfastpack1(initoffset,in,out); + break; + case 2: + __integratedfastpack2(initoffset,in,out); + break; + case 3: + __integratedfastpack3(initoffset,in,out); + break; + case 4: + __integratedfastpack4(initoffset,in,out); + break; + case 5: + __integratedfastpack5(initoffset,in,out); + break; + case 6: + __integratedfastpack6(initoffset,in,out); + break; + case 7: + __integratedfastpack7(initoffset,in,out); + break; + case 8: + __integratedfastpack8(initoffset,in,out); + break; + case 9: + __integratedfastpack9(initoffset,in,out); + break; + case 10: + __integratedfastpack10(initoffset,in,out); + break; + case 11: + __integratedfastpack11(initoffset,in,out); + break; + case 12: + __integratedfastpack12(initoffset,in,out); + break; + case 13: + __integratedfastpack13(initoffset,in,out); + break; + case 14: + __integratedfastpack14(initoffset,in,out); + break; + case 15: + __integratedfastpack15(initoffset,in,out); + break; + case 16: + 
__integratedfastpack16(initoffset,in,out); + break; + case 17: + __integratedfastpack17(initoffset,in,out); + break; + case 18: + __integratedfastpack18(initoffset,in,out); + break; + case 19: + __integratedfastpack19(initoffset,in,out); + break; + case 20: + __integratedfastpack20(initoffset,in,out); + break; + case 21: + __integratedfastpack21(initoffset,in,out); + break; + case 22: + __integratedfastpack22(initoffset,in,out); + break; + case 23: + __integratedfastpack23(initoffset,in,out); + break; + case 24: + __integratedfastpack24(initoffset,in,out); + break; + case 25: + __integratedfastpack25(initoffset,in,out); + break; + case 26: + __integratedfastpack26(initoffset,in,out); + break; + case 27: + __integratedfastpack27(initoffset,in,out); + break; + case 28: + __integratedfastpack28(initoffset,in,out); + break; + case 29: + __integratedfastpack29(initoffset,in,out); + break; + case 30: + __integratedfastpack30(initoffset,in,out); + break; + case 31: + __integratedfastpack31(initoffset,in,out); + break; + case 32: + __integratedfastpack32(initoffset,in,out); + break; + default: + break; + } + } + + + + static void inline ipackwithoutmask(const uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + uint32_t initoffset = 0; + + for(size_t k = 0; k < Qty/BlockSize; ++k) { + integratedfastpackwithoutmask(initoffset,in+k*BlockSize,out+k*bit,bit); + initoffset = *(in+k*BlockSize+BlockSize - 1); + } + } + + + static void inline pack(uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + uint32_t initoffset = 0; + + for(size_t k = 0; k < Qty/BlockSize; ++k) { + const uint32_t nextoffset = *(in+k*BlockSize+BlockSize - 1); + if(bit<32) delta(initoffset,in+k*BlockSize); + fastpack(in+k*BlockSize,out+k*bit,bit); + initoffset = nextoffset; + } + } + + static void inline 
packWithoutDelta(uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + for(size_t k = 0; k < Qty/BlockSize; ++k) { + fastpack(in+k*BlockSize,out+k*bit,bit); + } + } + + static void inline unpack(const uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + uint32_t initoffset = 0; + + for(size_t k = 0; k < Qty/BlockSize; ++k) { + fastunpack(in+k*bit,out+k*BlockSize,bit); + if(bit<32) inverseDelta(initoffset,out+k*BlockSize); + initoffset = *(out+k*BlockSize+BlockSize - 1); + } + } + + static void inline unpackWithoutDelta(const uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + for(size_t k = 0; k < Qty/BlockSize; ++k) { + fastunpack(in+k*bit,out+k*BlockSize,bit); + } + } + + + static void inline packwithoutmask(uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + uint32_t initoffset = 0; + + for(size_t k = 0; k < Qty/BlockSize; ++k) { + const uint32_t nextoffset = *(in+k*BlockSize+BlockSize - 1); + if(bit<32) delta(initoffset,in+k*BlockSize); + fastpackwithoutmask(in+k*BlockSize,out+k*bit,bit); + initoffset = nextoffset; + } + } + + + static void inline packwithoutmaskWithoutDelta(uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + for(size_t k = 0; k < Qty/BlockSize; ++k) { + fastpackwithoutmask(in+k*BlockSize,out+k*bit,bit); + } + } + + + static void inline iunpack(const uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % BlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + + uint32_t initoffset = 0; + for(size_t k = 0; k < Qty/BlockSize; ++k) { + integratedfastunpack(initoffset,in+k*bit,out+k*BlockSize,bit); + initoffset = *(out+k*BlockSize+BlockSize - 1); + } + } + + + /*static void GenRandom(std::vector& data, int b) { + data[0] = random(b); + + 
for(size_t i = 1 ; i < data.size() ; ++i ) + data[i] = random(b) + data[i-1]; + }*/ + + static void CheckMaxDiff(const std::vector& refdata, unsigned bit) { + for(size_t i = 1; i < refdata.size(); ++i ) { + if(gccbits(refdata[i]-refdata[i-1])>bit) throw std::runtime_error("bug"); + + } + } +}; + + +#endif /* BITPACKINGHELPERS_H_ */ diff --git a/include/boolarray.h b/include/boolarray.h new file mode 100644 index 0000000..198048c --- /dev/null +++ b/include/boolarray.h @@ -0,0 +1,200 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ + +#ifndef BOOLARRAY_H_ +#define BOOLARRAY_H_ + +#include "common.h" + +using namespace std; + + +static inline int numberOfTrailingZeros(uint64_t x) { + if (x == 0) return 64; + return __builtin_ctzl(x); +} + + + + +class BoolArray { +public: + + + vector buffer; + size_t sizeinbits; + BoolArray(const size_t n, const uint64_t initval = 0) : + buffer(n / 64 + (n % 64 == 0 ? 0 : 1), initval), + sizeinbits(n) { + } + + BoolArray() : + buffer(), sizeinbits(0) { + } + + BoolArray(const BoolArray & ba) : + buffer(ba.buffer), sizeinbits(ba.sizeinbits) { + } + + void inplaceIntersect(const BoolArray & other) { + assert(other.buffer.size() == buffer.size()); + for(size_t i = 0; i < buffer.size(); ++i) + buffer[i] &= other.buffer[i]; + } + + // this is no faster because the compiler will vectorize + // inplaceIntersect automagically? 
+ void SIMDinplaceIntersect(const BoolArray & other) { + assert(other.buffer.size() == buffer.size()); + __m128i * bin = reinterpret_cast<__m128i *>(buffer.data()); + const __m128i * bo = reinterpret_cast(other.buffer.data()); + for(size_t i = 0; i < buffer.size()/2; ++i) { + __m128i p1 = _mm_load_si128 (bin + i); + __m128i p2 = _mm_load_si128 (bo + i); + __m128i andp1p2 = _mm_and_si128 (p1,p2); + _mm_storeu_si128 (bin + i, andp1p2); + } + for(size_t i = buffer.size()/2 * 2; i < buffer.size(); ++i) + buffer[i] &= other.buffer[i]; + } + + + void intersect(const BoolArray & other, BoolArray & output) { + assert(other.buffer.size() == buffer.size()); + output.buffer.resize(buffer.size()); + for(size_t i = 0; i < buffer.size(); ++i) + output.buffer[i] = buffer[i] & other.buffer[i]; + } + + + // this is no faster because the compiler will vectorize + // intersect automagically? + void SIMDintersect(const BoolArray & other, BoolArray & output) { + assert(other.buffer.size() == buffer.size()); + output.buffer.resize(buffer.size()); + const __m128i * bin = reinterpret_cast(buffer.data()); + const __m128i * bo = reinterpret_cast(other.buffer.data()); + __m128i * bout = reinterpret_cast<__m128i *>(output.buffer.data()); + + for(size_t i = 0; i < buffer.size()/2; ++i) { + __m128i p1 = _mm_load_si128 (bin + i); + __m128i p2 = _mm_load_si128 (bo + i); + __m128i andp1p2 = _mm_and_si128 (p1,p2); + _mm_storeu_si128 (bout + i, andp1p2); + } + for(size_t i = buffer.size()/2*2; i < buffer.size(); ++i) + output.buffer[i] = buffer[i] & other.buffer[i]; + } + + void setSizeInBits(const size_t sizeib) { + sizeinbits = sizeib; + } + + /** + * Write out this bitmap to a vector as a list of integers corresponding + * to set bits. The caller should have allocated enough memory. 
+ */ + void toArray(vector & ans) { + uint32_t pos = 0; + for (uint32_t k = 0; k < buffer.size(); ++k) { + const uint64_t myword = buffer[k]; + for(int offset = 0; offset<64;++offset) { + if((myword >> offset) == 0) break; + offset+=numberOfTrailingZeros((myword >> offset)); + ans[pos++]=64 * k + offset; + } + } + ans.resize(pos); + } + + + /** + * This is a version of toArray where we write to a pointer. + * Returns the number of written ints. + */ + size_t toInts(uint32_t * out) { + size_t pos = 0; + for (uint32_t k = 0; k < buffer.size(); ++k) { + const uint64_t myword = buffer[k]; + for(int offset = 0; offset<64;++offset) { + if((myword >> offset) == 0) break; + offset+=numberOfTrailingZeros((myword >> offset)); + out[pos++]=64 * k + offset; + } + } + return pos; + } + BoolArray& operator=(const BoolArray & x) { + this->buffer = x.buffer; + this->sizeinbits = x.sizeinbits; + return *this; + } + + /** + * set to true (whether it was already set to true or not) + * + * This is an expensive (random access) API, you really ought to + * prepare a new word and then append it. + */ + __attribute__((always_inline)) + inline void set(const size_t pos) { + buffer[pos / 64] |= (static_cast (1) << (pos + % 64)); + } + + /** + * set to false (whether it was already set to false or not) + * + * This is an expensive (random access) API, you really ought to + * prepare a new word and then append it. + */ + __attribute__((always_inline)) + inline void unset(const size_t pos) { + buffer[pos / 64] |= ~(static_cast (1) << (pos + % 64)); + } + + /** + * true of false? (set or unset) + */ + __attribute__((always_inline)) + inline bool get(const size_t pos) const { + return (buffer[pos / 64] & (static_cast (1) << (pos + % 64))) != 0; + } + + /** + * set all bits to 0 + */ + void reset() { + memset(buffer.data(), 0, sizeof(uint64_t) * buffer.size());//memset can be slow, does it matter? 
+ sizeinbits = 0; + } + + size_t sizeInBits() const { + return sizeinbits; + } + + size_t sizeInBytes() const { + return buffer.size() * sizeof(uint64_t); + } + + /** + * Return memory usage of a bitmap spanning n bits + */ + static size_t sizeInBytes(size_t n) { + size_t buffersize = n / 64 + (n % 64 == 0 ? 0 : 1); + return buffersize * sizeof(uint64_t); + } + + ~BoolArray() { + } + + +}; + + +#endif /* BOOLARRAY_H_ */ diff --git a/include/codecfactory.h b/include/codecfactory.h new file mode 100644 index 0000000..903766c --- /dev/null +++ b/include/codecfactory.h @@ -0,0 +1,160 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#ifndef CODECFACTORY_H_ +#define CODECFACTORY_H_ + +#include "common.h" +#include "codecs.h" +#include "common.h" +#include "compositecodec.h" +#include "bitpackinghelpers.h" +#include "simdbitpackinghelpers.h" +#include "delta.h" +#include "util.h" +#include "synthetic.h" +#include "binarypacking.h" +#include "simdbinarypacking.h" +#include "fastpfor.h" +#include "simdfastpfor.h" +#include "variablebyte.h" + +using namespace std; + +typedef VariableByte leftovercodec; + +static std::map > initializefactory() { + std::map < string, shared_ptr > schemes; + + schemes["fastpfor"] = shared_ptr ( + new CompositeCodec , + leftovercodec > ()); + + + schemes["copy"] = shared_ptr (new JustCopy()); + + schemes["varint"] = shared_ptr (new VariableByte ()); + schemes["s-fastpfor-4"] = shared_ptr ( + new CompositeCodec , leftovercodec > ()); + + schemes["s-fastpfor-m"] + = shared_ptr ( + new CompositeCodec , + VariableByte > ()); + schemes["s-fastpfor-1"] = shared_ptr ( + new CompositeCodec , leftovercodec > ()); + schemes["s-fastpfor-2"] = shared_ptr ( + new CompositeCodec , leftovercodec > ()); + + schemes["bp32"] = shared_ptr ( + new CompositeCodec , VariableByte< + true> > ()); + schemes["ibp32"] = shared_ptr ( + new CompositeCodec , + 
leftovercodec > ()); + + schemes["s-bp128+d1"] = shared_ptr ( + new CompositeCodec < SIMDBinaryPacking > , leftovercodec > ()); + schemes["s-bp128+d2"] = shared_ptr ( + new CompositeCodec < SIMDBinaryPacking >, leftovercodec > ()); + schemes["s-bp128+d4"] = shared_ptr ( + new CompositeCodec < SIMDBinaryPacking >, leftovercodec > ()); + schemes["s-bp128+dm"] = shared_ptr ( + new CompositeCodec < SIMDBinaryPacking >, leftovercodec > ()); + + schemes["s-bp128-1"] = shared_ptr ( + new CompositeCodec < SIMDBinaryPacking >, leftovercodec > ()); + schemes["s-bp128-2"] = shared_ptr ( + new CompositeCodec < SIMDBinaryPacking >, leftovercodec > ()); + schemes["s-bp128-4"] = shared_ptr ( + new CompositeCodec < SIMDBinaryPacking >, leftovercodec > ()); + schemes["s-bp128-m"] = shared_ptr ( + new CompositeCodec < SIMDBinaryPacking >, leftovercodec > ()); + + return schemes; +} + + + +class CODECFactory { +public: + static map > scodecmap; + static shared_ptr defaultptr; + + // hacked for convenience + static vector > allSchemes() { + vector < shared_ptr > ans; + for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { + ans.push_back(i->second); + } + return ans; + } + + static vector allNames() { + vector < string > ans; + for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { + ans.push_back(i->first); + } + return ans; + } + + /** + * This function tries to determine whether the + * input is modified during compression. 
+ */ + static bool modifiesInputDuringCompression(IntegerCODEC & v) { + vector test; + const uint32_t N = 2049; + for(uint32_t k = 0; k < N; ++k) + test.emplace_back(k); + vector out(N+1024); + size_t outsize = out.size(); + v.encodeArray(test.data(),N,out.data(),outsize); + for(uint32_t k = 0; k < N; ++k) + if(test[k] != k) return true; + return false; // granted this is not full-proof, but is ok in our context + } + + static string getName(IntegerCODEC & v) { + for(auto i = scodecmap.begin(); i != scodecmap.end() ; ++i) { + if(i->second.get() == &v ) + return i->first; + } + return "UNKNOWN"; + } + + static bool valid(string name) { + return (scodecmap.find(name) != scodecmap.end()) ; + } + + static shared_ptr & getFromName(string name) { + if (scodecmap.find(name) == scodecmap.end()) { + cerr << "name " << name << " does not refer to a CODEC." << endl; + cerr << "possible choices:" << endl; + for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { + cerr << static_cast (i->first) << endl;// useless cast, but just to be clear + } + return defaultptr; + } + return scodecmap[name]; + } + +}; + +map > CODECFactory::scodecmap = + initializefactory(); + +shared_ptr CODECFactory::defaultptr = shared_ptr(nullptr); +#endif /* CODECFACTORY_H_ */ diff --git a/include/codecs.h b/include/codecs.h new file mode 100644 index 0000000..02c810b --- /dev/null +++ b/include/codecs.h @@ -0,0 +1,136 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#ifndef CODECS_H_ +#define CODECS_H_ + + +#include "common.h" +#include "util.h" +#include "bitpackinghelpers.h" + +using namespace std; + +class NotEnoughStorage: public std::runtime_error { +public: + size_t required;// number of 32-bit symbols required + NotEnoughStorage(const size_t req) : + runtime_error(""), required(req) { + + } +}; + +class IntegerCODEC { +public: + + /** + * You specify input and input length, as well as + * output and output length. nvalue gets modified to + * reflect how much was used. If the new value of + * nvalue is more than the original value, we can + * consider this a buffer overrun. + * + * You are responsible for allocating the memory (length + * for *in and nvalue for *out). + */ + virtual void encodeArray(uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) = 0; + + /** + * Usage is similar to encodeArray except that it returns a pointer + * incremented from in. In theory it should be in+length. If the + * returned pointer is less than in+length, then this generally means + * that the decompression is not finished (some scheme compress + * the bulk of the data one way, and they then they compress remaining + * integers using another scheme). + * + * As with encodeArray, you need to have length element allocated + * for *in and at least nvalue elements allocated for out. The value + * of the variable nvalue gets updated with the number actually use + * (if nvalue exceeds the original value, there might be a buffer + * overrun). + */ + virtual const uint32_t * decodeArray(const uint32_t *in, + const size_t length, uint32_t *out, size_t &nvalue)= 0; + virtual ~IntegerCODEC() { + } + + /** + * Will compress the content of a vector into + * another vector. + * + * This is offered for convenience. It might be slow. 
+ */ + virtual vector compress(vector & data) { + vector < uint32_t > compresseddata(data.size() * 2 + 1024);// allocate plenty of memory + size_t memavailable = compresseddata.size(); + encodeArray(data.data(), data.size(), compresseddata.data(), memavailable); + compresseddata.resize(memavailable); + return compresseddata; + } + + /** + * Will uncompress the content of a vector into + * another vector. Some CODECs know exactly how much data to uncompress, + * others need to uncompress it all to know how data there is to uncompress... + * So it useful to have a hint (expected_uncompressed_size) that tells how + * much data there will be to uncompress. Otherwise, the code will + * try to guess, but the result is uncertain and inefficient. You really + * ought to keep track of how many symbols you had compressed. + * + * For convenience. Might be slow. + */ + virtual vector uncompress( + vector & compresseddata, + size_t expected_uncompressed_size = 0) { + vector < uint32_t > data(expected_uncompressed_size);// allocate plenty of memory + size_t memavailable = data.size(); + try { + decodeArray(compresseddata.data(), compresseddata.size(), data.data(), + memavailable); + } catch (NotEnoughStorage & nes) { + data.resize(nes.required + 1024); + decodeArray(compresseddata.data(), compresseddata.size(), data.data(), + memavailable); + + } + data.resize(memavailable); + return data; + } + + virtual string name() const = 0; +}; + +/****************** + * This just copies the data, no compression. 
+ */ +class JustCopy: public IntegerCODEC { +public: + void encodeArray(uint32_t * in, const size_t length, uint32_t * out, + size_t &nvalue) { + memcpy(out, in, sizeof(uint32_t) * length); + nvalue = length; + } + // like encodeArray, but we don't actually copy + void fakeencodeArray(const uint32_t * /*in*/, const size_t length, + size_t &nvalue) { + nvalue = length; + } + + const uint32_t * decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t & nvalue) { + memcpy(out, in, sizeof(uint32_t) * length); + nvalue = length; + return in + length; + } + string name() const { + return "JustCopy"; + } +}; + + +#endif /* CODECS_H_ */ diff --git a/include/common.h b/include/common.h new file mode 100644 index 0000000..5ed493c --- /dev/null +++ b/include/common.h @@ -0,0 +1,46 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef COMMON_H_ +#define COMMON_H_ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#endif /* COMMON_H_ */ diff --git a/include/compositecodec.h b/include/compositecodec.h new file mode 100644 index 0000000..c65fb7f --- /dev/null +++ b/include/compositecodec.h @@ -0,0 +1,68 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef COMPOSITECODEC_H_ +#define COMPOSITECODEC_H_ + +#include "common.h" +#include "util.h" +#include "codecs.h" + +/** + * This is a useful class for CODEC that only compress + * data having length a multiple of some unit length. 
+ */ +template +class CompositeCodec: public IntegerCODEC { +public: + CompositeCodec() : + codec1(), codec2() { + } + Codec1 codec1; + Codec2 codec2; + void encodeArray(uint32_t * in, const size_t length, uint32_t * out, + size_t &nvalue) { + const size_t roundedlength = length / Codec1::BlockSize + * Codec1::BlockSize; + size_t nvalue1 = nvalue; + codec1.encodeArray(in, roundedlength, out, nvalue1); + + if (roundedlength < length) { + ASSERT(nvalue >= nvalue1, nvalue << " " << nvalue1); + size_t nvalue2 = nvalue - nvalue1; + codec2.encodeArray(in + roundedlength, length - roundedlength, + out + nvalue1, nvalue2); + nvalue = nvalue1 + nvalue2; + } else { + nvalue = nvalue1; + } + } + const uint32_t * decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t & nvalue) { + const uint32_t * const initin(in); + size_t mynvalue1 = nvalue; + const uint32_t *in2 = codec1.decodeArray(in, length, out, mynvalue1); + if (length + in > in2) { + assert(nvalue > mynvalue1); + size_t nvalue2 = nvalue - mynvalue1; + const uint32_t *in3 = codec2.decodeArray(in2, length - (in2 - in), + out + mynvalue1, nvalue2); + nvalue = mynvalue1 + nvalue2; + assert(initin + length >= in3); + return in3; + } + nvalue = mynvalue1; + assert(initin + length >= in2); + return in2; + } + string name() const { + ostringstream convert; + convert << codec1.name() << "+" << codec2.name(); + return convert.str(); + } +}; + +#endif /* COMPOSITECODEC_H_ */ diff --git a/include/delta.h b/include/delta.h new file mode 100644 index 0000000..e615cce --- /dev/null +++ b/include/delta.h @@ -0,0 +1,83 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Leonid Boytsov, Nathan Kurz and Daniel Lemire + */ + +#ifndef DELTA_H_ +#define DELTA_H_ + + +#include "common.h" + +/** + * To avoid crazy dependencies, this header should not + * include any other header file. 
+ */ + + +template +void delta(const T initoffset, T * data, const size_t size) { + if(size == 0) return; // nothing to do + if(size > 1) + for (size_t i = size - 1; i > 0; --i) { + data[i] -= data[i - 1]; + } + data[0] -= initoffset; +} + +template +void delta(const T initoffset, T * data) { + if(size == 0) return; // nothing to do + if(size > 1) + for (size_t i = size - 1; i > 0; --i) { + data[i] -= data[i - 1]; + } + data[0] -= initoffset; +} + + +template +void inverseDelta(const T initoffset, T * data, const size_t size) { + if (size == 0) return; // nothing to do + data[0] += initoffset; + const size_t UnrollQty = 4; + const size_t sz0 = (size / UnrollQty) * UnrollQty; // equal to 0, if size < UnrollQty + size_t i = 1; + if (sz0 >= UnrollQty) { + T a = data[0]; + for (; i < sz0 - UnrollQty; i += UnrollQty) { + a = data[i] += a; + a = data[i + 1] += a; + a = data[i + 2] += a; + a = data[i + 3] += a; + } + } + for (; i != size; ++i) { + data[i] += data[i - 1]; + } +} +template +void inverseDelta(const T initoffset, T * data) { + if (size == 0) return; // nothing to do + data[0] += initoffset; + const size_t UnrollQty = 4; + const size_t sz0 = (size / UnrollQty) * UnrollQty; // equal to 0, if size < UnrollQty + size_t i = 1; + if (sz0 >= UnrollQty) { + T a = data[0]; + for (; i < sz0 - UnrollQty; i += UnrollQty) { + a = data[i] += a; + a = data[i + 1] += a; + a = data[i + 2] += a; + a = data[i + 3] += a; + } + } + for (; i != size; ++i) { + data[i] += data[i - 1]; + } +} + + +#endif /* DELTA_H_ */ diff --git a/include/deltatemplates.h b/include/deltatemplates.h new file mode 100644 index 0000000..c49a66a --- /dev/null +++ b/include/deltatemplates.h @@ -0,0 +1,164 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Leonid Boytsov, Nathan Kurz and Daniel Lemire + */ + +#ifndef DELTATEMPLATES_H_ +#define DELTATEMPLATES_H_ + +#include "common.h" + +/** + * To avoid crazy dependencies, this header should not + * include any other header file. + */ + +/** + * The structs RegularDeltaSIMD, NoDelta, CoarseDelta4SIMD, CoarseDelta2SIMD, Max4DeltaSIMD + * are used in templates to specify which type of differential encoding to use (if any). + * + * See SIMDDeltaProcessor + */ + +struct RegularDeltaSIMD { + // Folklore code, unknown origin of this idea + __attribute__((always_inline)) + static inline __m128i PrefixSum(__m128i curr, __m128i prev) { + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); + return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); + } + + __attribute__((always_inline)) + static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); + } + + static bool usesDifferentialEncoding() { return true; } + + static std::string name() { return "Delta1"; } +}; + +struct NoDelta { + __attribute__((always_inline)) + static inline __m128i PrefixSum(__m128i curr, __m128i ) { + return curr; + } + __attribute__((always_inline)) + static inline __m128i Delta(__m128i curr, __m128i ) { + return curr; + } + + static bool usesDifferentialEncoding() { return false; } + static std::string name() { return "NoDelta"; } +}; + +struct CoarseDelta4SIMD { + __attribute__((always_inline)) + // Proposed and implemented by L. 
Boytosv + static inline __m128i PrefixSum(__m128i curr, __m128i prev) { + return _mm_add_epi32(curr, prev); + } + __attribute__((always_inline)) + static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, prev); + } + + static bool usesDifferentialEncoding() { return true; } + + static std::string name() { return "Delta4"; } +}; + +struct CoarseDelta2SIMD { + __attribute__((always_inline)) + // Proposed and implemented by L. Boytosv + static inline __m128i PrefixSum(__m128i curr, __m128i prev) { + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); + return _mm_add_epi32(_tmp1, _mm_shuffle_epi32(prev, _MM_SHUFFLE(3,2,3,2))); + } + __attribute__((always_inline)) + static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, _mm_or_si128(_mm_slli_si128(curr, 8), _mm_srli_si128(prev, 8))); + } + static bool usesDifferentialEncoding() { return true; } + + static std::string name() { return "Delta2"; } +}; + +struct Max4DeltaSIMD { + __attribute__((always_inline)) + // The idea is due to N. Kurz + static inline __m128i PrefixSum(__m128i curr, __m128i prev) { + return _mm_add_epi32(curr, _mm_shuffle_epi32(prev, 0xff)); + } + __attribute__((always_inline)) + static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, _mm_shuffle_epi32(prev, 0xff)); + } + static std::string name() { return "DeltaM4"; } + + static bool usesDifferentialEncoding() { return true; } + +}; + + + +/** + * Wrapper around the structs RegularDeltaSIMD, NoDelta, CoarseDelta4SIMD, CoarseDelta2SIMD, Max4DeltaSIMD + * to compute differential encoding and prefix sums. 
+ */ +template +struct SIMDDeltaProcessor { + static __m128i runPrefixSum(__m128i initOffset, uint32_t * pData) { + const size_t QtyDivBy4 = TotalQty / 4; + // The block should contain 8N 32-bit integers, where N is some integer + assert(QtyDivBy4 % 2 == 0); + + __m128i* pCurr = reinterpret_cast<__m128i*>(pData); + const __m128i* pEnd = pCurr + QtyDivBy4; + + // Leonid Boytsov: manual loop unrolling may be crucial here. + while (pCurr < pEnd) { + initOffset = DeltaHelper::PrefixSum(_mm_load_si128(pCurr), initOffset); + _mm_store_si128(pCurr++, initOffset); + + initOffset = DeltaHelper::PrefixSum(_mm_load_si128(pCurr), initOffset); + _mm_store_si128(pCurr++, initOffset); + } + + return initOffset; + } + + static void runDelta(__m128i initOffset, uint32_t * pData) { + const size_t QtyDivBy4 = TotalQty / 4; + // The block should contain 8N 32-bit integers, where N is some integer + assert(QtyDivBy4 && QtyDivBy4 % 2 == 0); + __m128i* pCurr = reinterpret_cast<__m128i*>(pData) + QtyDivBy4 - 1; + __m128i* pStart = reinterpret_cast<__m128i*>(pData); + __m128i a = _mm_load_si128(pCurr); + // Leonid Boytsov: manual loop unrolling may be crucial here. + while (pCurr > pStart + 1) { + __m128i b = _mm_load_si128(pCurr - 1); + _mm_store_si128(pCurr, DeltaHelper::Delta(a, b)); + a = b; + --pCurr; + + b = _mm_load_si128(pCurr - 1); + _mm_store_si128(pCurr, DeltaHelper::Delta(a, b)); + a = b; + --pCurr; + } + + __m128i b = _mm_load_si128(pStart); + _mm_store_si128(pStart + 1, DeltaHelper::Delta(a, b)); + a = b; + + _mm_store_si128(pStart , DeltaHelper::Delta(a, initOffset)); + } +}; + + + +#endif /* DELTATEMPLATES_H_ */ diff --git a/include/fastpfor.h b/include/fastpfor.h new file mode 100644 index 0000000..cf82871 --- /dev/null +++ b/include/fastpfor.h @@ -0,0 +1,360 @@ +/* + * This is the non-SIMD version of FastPFOR. + * It is not recommended per se, only provided for + * comparison purposes. 
+ */ + +#ifndef FASTPFOR_H_ +#define FASTPFOR_H_ + + +#include "common.h" +#include "codecs.h" +#include "bitpackinghelpers.h" +#include "util.h" +#include "delta.h" + + + +class ScalarSortedBitPacker { +public: + + enum{DEFAULTSIZE = 128}; + + ScalarSortedBitPacker() { + for(uint32_t i = 0; i < 32;++i) { + data[i] = new uint32_t[DEFAULTSIZE]; + memset(data[i],0,DEFAULTSIZE*sizeof(uint32_t)); + actualsizes[i] = DEFAULTSIZE; + } + clear(); + } + + void reset() { + for(uint32_t i = 0; i < 32;++i) { + delete[] data[i]; + data[i] = new uint32_t[DEFAULTSIZE]; + memset(data[i],0,DEFAULTSIZE*sizeof(uint32_t)); + actualsizes[i] = DEFAULTSIZE; + } + clear(); + } + + ~ScalarSortedBitPacker() { + free(); + } + void free() { + clear(); + for(uint32_t i = 0; i < 32;++i) + if( data[i] != NULL) { + delete[] data[i]; + data[i] = NULL; + actualsizes[i] = 0; + } + } + void directAppend(uint32_t i, uint32_t val) { + data[i][sizes[i]++] = val; + } + + const uint32_t * get(int i) { + return data[i]; + } + + void ensureCapacity(int i, uint32_t datatoadd) { + if(sizes[i]+datatoadd>actualsizes[i]) { + actualsizes[i] = (sizes[i]+datatoadd+127)/128*128*2; + uint32_t * tmp = new uint32_t[actualsizes[i]]; + for(uint32_t j = 0; j< sizes[i];++j) + tmp[j]= data[i][j]; + delete[] data[i]; + data[i] = tmp; + } + } + + void clear() { + for(uint32_t i = 0; i < 32;++i) + sizes[i] = 0;// memset "might" be faster. 
+ } + + uint32_t * write(uint32_t * out) { + uint32_t bitmap = 0; + for (uint32_t k = 0; k < 32; ++k) { + if (sizes[k] != 0) + bitmap |= (1U << k); + } + *(out++) = bitmap; + + for (uint32_t k = 0; k < 32; ++k) { + if (sizes[k] != 0) { + *out = sizes[k]; + out++; + for (uint32_t j = 0; j < sizes[k]; j += 32) { + BitPackingHelpers::fastpackwithoutmask(&data[k][j], out, + k + 1); + out += k + 1; + } + } + } + return out; + } + const uint32_t * read(const uint32_t * in) { + clear(); + const uint32_t bitmap = *(in++); + + for (uint32_t k = 0; k < 32; ++k) { + if ((bitmap & (1U << k)) != 0) { + sizes[k] = *in++; + if(actualsizes[k] +class FastPFor: public IntegerCODEC { +public: + /** + * ps (page size) should be a multiple of BlockSize, any "large" + * value should do. + */ + FastPFor(uint32_t ps = 65536) : + PageSize(ps), bitsPageSize(gccbits(PageSize)), bpacker(), + bytescontainer(PageSize + 3 * PageSize / BlockSize) { + assert(ps / BlockSize * BlockSize == ps); + assert(gccbits(BlockSizeInUnitsOfPackSize * PACKSIZE - 1) <= 8); + } + enum { + BlockSizeInUnitsOfPackSize = 4, + PACKSIZE = 32, + overheadofeachexcept = 8, + overheadduetobits = 8, + overheadduetonmbrexcept = 8, + BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE + }; + + + + const uint32_t PageSize; + const uint32_t bitsPageSize; + ScalarSortedBitPacker bpacker; + vector bytescontainer; + + const uint32_t * decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + const uint32_t * const initin(in); + const size_t mynvalue = *in; + ++in; + if (mynvalue > nvalue) + throw NotEnoughStorage(mynvalue); + nvalue = mynvalue; + const uint32_t * const finalout(out + nvalue); + uint32_t prev = 0; + while (out != finalout) { + size_t thisnvalue(0); + size_t thissize = + static_cast (finalout > PageSize + out ? 
PageSize + : (finalout - out)); + + __decodeArray(in, thisnvalue, out, thissize,prev); + in += thisnvalue; + out += thissize; + } + assert(initin + length >= in); + bpacker.reset();// if you don't do this, the codec has a "memory". + return in; + } + + /** + * If you save the output and recover it in memory, you are + * responsible to ensure that the alignment is preserved. + * + * The input size (length) should be a multiple of + * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done + * to simplify slightly the implementation.) + */ + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + checkifdivisibleby(length, BlockSize); + const uint32_t * const initout(out); + const uint32_t * const finalin(in + length); + + *out++ = static_cast(length); + const size_t oldnvalue = nvalue; + nvalue = 1; + uint32_t prev = 0; + while (in != finalin) { + size_t thissize = + static_cast (finalin > PageSize + in ? PageSize + : (finalin - in)); + size_t thisnvalue(0); + __encodeArray(in, thissize, out, thisnvalue,prev); + nvalue += thisnvalue; + out += thisnvalue; + in += thissize; + } + assert(out == nvalue + initout); + if (oldnvalue < nvalue) + cerr << "It is possible we have a buffer overrun. 
" << endl; + bpacker.reset();// if you don't do this, the buffer has a memory + } + + + void getBestBFromData(const uint32_t * in, uint8_t& bestb, + uint8_t & bestcexcept, uint8_t & maxb) { + uint32_t freqs[33]; + for (uint32_t k = 0; k <= 32; ++k) + freqs[k] = 0; + for (uint32_t k = 0; k < BlockSize; ++k) { + freqs[asmbits(in[k])]++; + } + bestb = 32; + while (freqs[bestb] == 0) + bestb--; + maxb = bestb; + uint32_t bestcost = bestb * BlockSize; + uint32_t cexcept = 0; + bestcexcept = static_cast(cexcept); + for (uint32_t b = bestb - 1; b < 32; --b) { + cexcept += freqs[b + 1]; + uint32_t thiscost = cexcept * overheadofeachexcept + cexcept + * (maxb - b) + b * BlockSize + 8;// the extra 8 is the cost of storing maxbits + if (thiscost < bestcost) { + bestcost = thiscost; + bestb = static_cast(b); + bestcexcept = static_cast(cexcept); + } + } + } + + void __encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t & nvalue, uint32_t & prev) { + uint32_t * const initout = out; // keep track of this + checkifdivisibleby(length, BlockSize); + uint32_t * const headerout = out++; // keep track of this + bpacker.clear(); + uint8_t * bc = bytescontainer.data(); + //out = padTo128bits(out); + //if(needPaddingTo128Bits(in)) throw std::runtime_error("alignment bug"); + for (const uint32_t * const final = in + length; (in + BlockSize + <= final); in += BlockSize) { + uint8_t bestb, bestcexcept, maxb; + if(useDelta) { + uint32_t nextprev = in[BlockSize - 1]; + delta(prev,in,BlockSize); + prev = nextprev; + } + getBestBFromData(in, bestb, bestcexcept, maxb); + *bc++ = bestb; + *bc++ = bestcexcept; + if (bestcexcept > 0) { + *bc++ = maxb; + bpacker.ensureCapacity(maxb - bestb - 1,bestcexcept); + const uint32_t maxval = 1U << bestb; + for (uint32_t k = 0; k < BlockSize; ++k) { + if (in[k] >= maxval) { + bpacker.directAppend(maxb - bestb - 1, in[k] >> bestb); + *bc++ = static_cast(k); + } + } + } + for(size_t k = 0; k < 4; ++k) { + 
BitPackingHelpers::fastpack(in+k*32,out+k*bestb,bestb); + } + out += 4 * bestb; + } + headerout[0] = static_cast (out - headerout); + const uint32_t bytescontainersize = static_cast(bc - bytescontainer.data()); + *(out++) = bytescontainersize; + memcpy(out, bytescontainer.data(), bytescontainersize); + out += (bytescontainersize + sizeof(uint32_t) - 1) + / sizeof(uint32_t); + const uint32_t * const lastout = bpacker.write(out); + nvalue = lastout - initout; + } + + void __decodeArray(const uint32_t *in, size_t & length, uint32_t *out, + const size_t nvalue, uint32_t & prev) { + const uint32_t * const initin = in; + const uint32_t * const headerin = in++; + const uint32_t wheremeta = headerin[0]; + const uint32_t *inexcept = headerin + wheremeta; + const uint32_t bytesize = *inexcept++; + const uint8_t * bytep = reinterpret_cast (inexcept); + + inexcept += (bytesize + sizeof(uint32_t) - 1) / sizeof(uint32_t); + inexcept = bpacker.read(inexcept); + length = inexcept - initin; + const uint32_t * unpackpointers[32 + 1]; + for (uint32_t k = 1; k <= 32; ++k) { + unpackpointers[k] = bpacker.get(k-1); + } + for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out + += BlockSize) { + const uint8_t b = *bytep++; + const uint8_t cexcept = *bytep++; + for(size_t k = 0; k < 4; ++k) { + BitPackingHelpers::fastunpack(in+k*b,out+k*32,b); + } + in += 4*b; + if (cexcept > 0) { + const uint8_t maxbits = *bytep++; + const uint32_t * vals = unpackpointers[maxbits - b]; + unpackpointers[maxbits - b] += cexcept; + for (uint32_t k = 0; k < cexcept; ++k) { + const uint8_t pos = *(bytep++); + out[pos] |= vals[k] << b; + } + } + if(useDelta) { + inverseDelta(prev, out, BlockSize); + prev = out[BlockSize-1]; + } + } + + assert(in == headerin + wheremeta); + } + + string name() const { + return string("FastPFor")+ (useDelta?"Delta":""); + } + +}; + + + + + + +#endif /* FASTPFOR_H_ */ diff --git a/include/hybm2.h b/include/hybm2.h new file mode 100644 index 0000000..7d23b4a --- /dev/null +++ 
b/include/hybm2.h @@ -0,0 +1,634 @@ +/* + * This is an implementation of the hyb+m2 method proposed in: + * + * J. S. Culpepper and A. Moffat. Efficient set intersection for + * inverted indexing. ACM Trans. Inf. Syst., 29(1):1:1Ð1:25, Dec. 2010. + * + * Implemented by Daniel Lemire + */ + +#ifndef HYBM2_H_ +#define HYBM2_H_ + +#include "common.h" +#include "codecs.h" +#include "codecfactory.h" +#include "boolarray.h" +#include "intersection.h" +#include "skipping.h" + + +class HybM2 { +public: + + // th = 0 means that we select bitmap as needed + HybM2(IntegerCODEC & c, intersectionfunction inter, uint32_t MaxId, + uint32_t th = 32) : + bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), + threshold(th), recovbuffer(), codec(c), + Inter(inter) { + } + + /** + * Returns the *uncompressed* size of a given posting list (specified + * by ID). + */ + size_t getSizeInInts(uint32_t postId) { + return mapuncompsizes[postId]; + } + + + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * into the data structure. The data will be either converted to a bitmap or compressed. + */ + size_t load(const uint32_t postid, const uint32_t * data, + const uint32_t length) { + if(threshold == 0) + return loadOptimized(postid, data, length); + else if (length * threshold >= mMaxId) + return loadAsBitmap(postid, data, length); + else + return loadAsShortArray(postid, data, length); + } + + /** + * Check whether we have a posting list corresponding to postid + */ + bool hasBeenLoaded(const uint32_t postid) { + return ((shortlistmap.find(postid) != shortlistmap.end()) + || (bitmapmap.find(postid) != bitmapmap.end())); + } + + /** + * Compute the total of the volume of of posting lists + * corresponding to a query. 
+ */ + size_t computeUnpackVolume(const vector & ids) { + size_t answer = 0; + for(uint32_t id : ids) { + answer += mapuncompsizes[id]; + } + return answer; + } + + /** + * Compute the total of the intersection of of posting lists + * corresponding to a query, and output how many integers are + * in + * + * ids: the query as a set of posting ids + * out: write to result + * sizeout: will indicate how many integers were written to out. + * + * return unpack volume defined as the total volume of the posting + * lists that were needed to compute the intersection (which can + * be less than the total volume possible due to early abandoning). + */ + size_t intersect(const vector & ids, uint32_t * out, + size_t & sizeout) { + if (ids.empty()) { + sizeout = 0; + return 0; + } + vector>> > shortlists; + vector>> bitmaps; + //vector bitmapscard; + + for(uint32_t id : ids) { + if(shortlistmap.find(id)!=shortlistmap.end()) + shortlists.push_back(make_pair(mapuncompsizes[id],shortlistmap[id])); + else { + assert(bitmapmap.find(id)!=bitmapmap.end()); + bitmaps.push_back(make_pair(mapuncompsizes[id],bitmapmap[id]));} + } + size_t unpackVolume = 0; + if(shortlists.empty()) { + if(bitmaps.size() == 1) { + sizeout = bitmaps.front().second->toInts(out); + unpackVolume += sizeout; + return unpackVolume; + } + + BoolArray answer(mMaxId); + bitmaps[0].second->intersect(*bitmaps[1].second,answer); + unpackVolume += bitmaps[0].first + bitmaps[1].first; + for(uint32_t i = 2; i < bitmaps.size(); ++i) { + answer.inplaceIntersect(*bitmaps[i].second); + unpackVolume += bitmaps[i].first; + } + sizeout = answer.toInts(out); + return unpackVolume; + } else { + sort(shortlists.begin(),shortlists.end()); + sort(bitmaps.begin(),bitmaps.end()); + codec.decodeArray(shortlists[0].second->data(),shortlists[0].second->size(),out,sizeout); + unpackVolume+=sizeout; + assert(sizeout == shortlists[0].first); + for(uint32_t i = 1; (sizeout>0) && (i < shortlists.size()); ++i) { + size_t thissize = 
recovbuffer.size(); + codec.decodeArray(shortlists[i].second->data(),shortlists[i].second->size(), + recovbuffer.data(),thissize); + unpackVolume+=thissize; + sizeout = Inter(out,sizeout,recovbuffer.data(),thissize,out); + + } + size_t pos = 0; + for(uint32_t i = 0; (sizeout>0) && (i < bitmaps.size()); ++i) { + unpackVolume+=bitmaps[i].first; + shared_ptr & ba = bitmaps[i].second; + pos = 0; + for(uint32_t i = 0; i < sizeout; ++i) { + if(!ba->get(out[i])) + continue; + else + out[pos++] = out[i]; + } + sizeout = pos; + } + return unpackVolume; + } + } + + ~HybM2() {} + + /** + * Estimate of the volume of data used by this object. + */ + size_t storageInBytes() const { + size_t answer = 0; + for(auto i : bitmapmap) + answer += i.second->sizeInBytes(); + for(auto i : shortlistmap) + answer += i.second->size() * sizeof(uint32_t); + return answer; + } + + size_t sizeOfRecoveryBufferInWords() const { + return recovbuffer.size(); + } +private: + + // load as either a bitmap or a compressed short list + size_t loadOptimized(const uint32_t postid, const uint32_t * data, const uint32_t length) { + if(mapuncompsizes.find(postid)!=mapuncompsizes.end()) return 0; + vector * compressedbuffer = new vector(length+1024); + size_t outlength = compressedbuffer->size(); + vector tmp(data,data+length);// use the buffer because some codecs modify the input + codec.encodeArray(tmp.data(),length,compressedbuffer->data(),outlength); + if(outlength *sizeof(uint32_t) < BoolArray::sizeInBytes(mMaxId)) {// we are good + if(recovbuffer.size() < length) recovbuffer.resize(length); + compressedbuffer->resize(outlength); + compressedbuffer->shrink_to_fit(); + shortlistmap[postid]=shared_ptr>(compressedbuffer); + mapuncompsizes[postid] = length; + return compressedbuffer->size(); + } else { + delete compressedbuffer; + return loadAsBitmap(postid, data, length); + } + } + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * as a bitmap. 
+ * + * Do not call this directly, call load() instead. + */ + size_t loadAsBitmap(const uint32_t postid, const uint32_t * data, const uint32_t length) { + if(bitmapmap.find(postid)!=bitmapmap.end()) return 0; + BoolArray * ba = new BoolArray(mMaxId); + for(uint32_t k = 0; k < length; ++k) + ba->set(data[k]); + bitmapmap[postid] = shared_ptr(ba); + mapuncompsizes[postid] = length; + return ba->sizeInBytes()/sizeof(uint32_t); + } + + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * as a short array. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsShortArray(const uint32_t postid, const uint32_t * data, const uint32_t length) { + if(shortlistmap.find(postid)!=shortlistmap.end()) return 0; + if(recovbuffer.size() * compressedbuffer = new vector(length+1024); + size_t outlength = compressedbuffer->size(); + for(size_t i = 0; idata(),outlength); + compressedbuffer->resize(outlength); + compressedbuffer->shrink_to_fit(); + shortlistmap[postid]=shared_ptr>(compressedbuffer); + mapuncompsizes[postid] = length; + return compressedbuffer->size(); + } + + map > bitmapmap; + map > > shortlistmap; + map mapuncompsizes; + + const size_t mMaxId; //max value that can be stored in a list + const size_t threshold;//// 32 seems to be the recommended setting, no need to change it? + + vector recovbuffer; + + IntegerCODEC & codec;// how we compress the short lists + intersectionfunction Inter; + +}; + + + +/** + * This is a version of HybM2 without compression (other than the bitmaps). + */ +class UncompressedHybM2 { +public: + + UncompressedHybM2(intersectionfunction inter, uint32_t MaxId, + uint32_t th = 32) : + bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), + threshold(th), + Inter(inter) { + } + + /** + * Returns the *uncompressed* size of a given posting list (specified + * by ID). 
+ */ + size_t getSizeInInts(uint32_t postId) { + return mapuncompsizes[postId]; + } + + + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * into the data structure. The data will be either converted to a bitmap or compressed. + */ + size_t load(const uint32_t postid, const uint32_t * data, + const uint32_t length) { + if (length * threshold >= mMaxId) + return loadAsBitmap(postid, data, length); + else + return loadAsShortArray(postid, data, length); + } + + /** + * Check whether we have a posting list corresponding to postid + */ + bool hasBeenLoaded(const uint32_t postid) { + return ((shortlistmap.find(postid) != shortlistmap.end()) + || (bitmapmap.find(postid) != bitmapmap.end())); + } + + /** + * Compute the total of the volume of of posting lists + * corresponding to a query. + */ + size_t computeUnpackVolume(const vector & ids) { + size_t answer = 0; + for(uint32_t id : ids) { + answer += mapuncompsizes[id]; + } + return answer; + } + + + + /** + * Compute the total of the intersection of of posting lists + * corresponding to a query, and output how many integers are + * in + * + * ids: the query as a set of posting ids + * out: write to result + * sizeout: will indicate how many integers were written to out. + * + * return unpack volume defined as the total volume of the posting + * lists that were needed to compute the intersection (which can + * be less than the total volume possible due to early abandoning). 
+ */ + size_t intersect(const vector & ids, uint32_t * out, + size_t & sizeout) { + if (ids.empty()) { + sizeout = 0; + return 0; + } + vector>> > shortlists; + vector>> bitmaps; + //vector bitmapscard; + + for(uint32_t id : ids) { + if(shortlistmap.find(id)!=shortlistmap.end()) + shortlists.push_back(make_pair(mapuncompsizes[id],shortlistmap[id])); + else { + assert(bitmapmap.find(id)!=bitmapmap.end()); + bitmaps.push_back(make_pair(mapuncompsizes[id],bitmapmap[id]));} + } + size_t unpackVolume = 0; + if(shortlists.empty()) { + if(bitmaps.size() == 1) { + sizeout = bitmaps.front().second->toInts(out); + unpackVolume += sizeout; + return unpackVolume; + } + + BoolArray answer(mMaxId); + bitmaps[0].second->intersect(*bitmaps[1].second,answer); + unpackVolume += bitmaps[0].first + bitmaps[1].first; + for(uint32_t i = 2; i < bitmaps.size(); ++i) { + answer.inplaceIntersect(*bitmaps[i].second); + unpackVolume += bitmaps[i].first; + } + sizeout = answer.toInts(out); + return unpackVolume; + } else { + sort(shortlists.begin(),shortlists.end()); + sort(bitmaps.begin(),bitmaps.end()); + assert(sizeout>=shortlists[0].second->size()); + sizeout = shortlists[0].second->size(); + unpackVolume+=shortlists[0].second->size(); + assert(sizeout == shortlists[0].first); + // we have to make a copy because by convention the output is not directly from the index + const vector& firstvector = *shortlists[0].second; + for(uint32_t i = 0; i < firstvector.size();++i) + out[i] = firstvector[i]; + for(uint32_t i = 1; (sizeout>0) && (i < shortlists.size()); ++i) { + unpackVolume+=shortlists[i].first; + sizeout = Inter(out,sizeout,shortlists[i].second->data(),shortlists[i].second->size(),out); + } + size_t pos = 0; + for(uint32_t i = 0; (sizeout>0) && (i < bitmaps.size()); ++i) { + unpackVolume+=bitmaps[i].first; + shared_ptr & ba = bitmaps[i].second; + pos = 0; + for(uint32_t i = 0; i < sizeout; ++i) { + if(!ba->get(out[i])) + continue; + else + out[pos++] = out[i]; + } + sizeout = pos; + } 
+ return unpackVolume; + } + } + + ~UncompressedHybM2() {} + + /** + * Estimate of the volume of data used by this object. + */ + size_t storageInBytes() const { + size_t answer = 0; + for(auto i : bitmapmap) + answer += i.second->sizeInBytes(); + for(auto i : shortlistmap) + answer += i.second->size() * sizeof(uint32_t); + return answer; + } + + +private: + + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * as a bitmap. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsBitmap(const uint32_t postid, const uint32_t * data, const uint32_t length) { + if(bitmapmap.find(postid)!=bitmapmap.end()) return 0; + BoolArray * ba = new BoolArray(mMaxId); + for(uint32_t k = 0; k < length; ++k) + ba->set(data[k]); + bitmapmap[postid] = shared_ptr(ba); + mapuncompsizes[postid] = length; + return ba->sizeInBytes()/sizeof(uint32_t); + } + + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * as a short array. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsShortArray(const uint32_t postid, const uint32_t * data, const uint32_t length) { + if(shortlistmap.find(postid)!=shortlistmap.end()) return 0; + mapuncompsizes[postid] = length; + vector * compressedbuffer = new vector(data,data+length); + shortlistmap[postid]=shared_ptr>(compressedbuffer); + return compressedbuffer->size(); + } + + map > bitmapmap; + map > > shortlistmap; + map mapuncompsizes; + + const size_t mMaxId; //max value that can be stored in a list + const size_t threshold;//// 32 seems to be the recommended setting, no need to change it? + + + intersectionfunction Inter; + +}; + + + + +/** + * This is a version of HybM2 without compression (other than the bitmaps). 
+ */ +class SkippingHybM2 { +public: + + SkippingHybM2(uint32_t MaxId, + uint32_t th = 32, uint32_t BS = 8) : + bitmapmap(), shortlistmap(), mapuncompsizes(), mMaxId(MaxId), + threshold(th), + BlockSizeLog (BS) { + } + + /** + * Returns the *uncompressed* size of a given posting list (specified + * by ID). + */ + size_t getSizeInInts(uint32_t postId) { + return mapuncompsizes[postId]; + } + + + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * into the data structure. The data will be either converted to a bitmap or compressed. + */ + size_t load(const uint32_t postid, const uint32_t * data, + const uint32_t length) { + if (length * threshold >= mMaxId) + return loadAsBitmap(postid, data, length); + else + return loadAsShortArray(postid, data, length); + } + + /** + * Check whether we have a posting list corresponding to postid + */ + bool hasBeenLoaded(const uint32_t postid) { + return ((shortlistmap.find(postid) != shortlistmap.end()) + || (bitmapmap.find(postid) != bitmapmap.end())); + } + + /** + * Compute the total of the volume of of posting lists + * corresponding to a query. + */ + size_t computeUnpackVolume(const vector & ids) { + size_t answer = 0; + for(uint32_t id : ids) { + answer += mapuncompsizes[id]; + } + return answer; + } + + + + /** + * Compute the total of the intersection of of posting lists + * corresponding to a query, and output how many integers are + * in + * + * ids: the query as a set of posting ids + * out: write to result + * sizeout: will indicate how many integers were written to out. + * + * return unpack volume defined as the total volume of the posting + * lists that were needed to compute the intersection (which can + * be less than the total volume possible due to early abandoning). 
+ */ + size_t intersect(const vector & ids, uint32_t * out, + size_t & sizeout) { + if (ids.empty()) { + sizeout = 0; + return 0; + } + vector> > shortlists; + vector>> bitmaps; + //vector bitmapscard; + + for(uint32_t id : ids) { + if(shortlistmap.find(id)!=shortlistmap.end()) + shortlists.push_back(make_pair(mapuncompsizes[id],shortlistmap[id])); + else { + assert(bitmapmap.find(id)!=bitmapmap.end()); + bitmaps.push_back(make_pair(mapuncompsizes[id],bitmapmap[id]));} + } + size_t unpackVolume = 0; + if(shortlists.empty()) { + if(bitmaps.size() == 1) { + sizeout = bitmaps.front().second->toInts(out); + unpackVolume += sizeout; + return unpackVolume; + } + + BoolArray answer(mMaxId); + bitmaps[0].second->intersect(*bitmaps[1].second,answer); + unpackVolume += bitmaps[0].first + bitmaps[1].first; + for(uint32_t i = 2; i < bitmaps.size(); ++i) { + answer.inplaceIntersect(*bitmaps[i].second); + unpackVolume += bitmaps[i].first; + } + sizeout = answer.toInts(out); + return unpackVolume; + } else { + sort(shortlists.begin(),shortlists.end()); + sort(bitmaps.begin(),bitmaps.end()); + if (shortlists.size() == 1) { + sizeout = shortlists[0].second->decompress(out); + unpackVolume += shortlists[0].second->Length; + } else { + unpackVolume += shortlists[0].second->Length; + unpackVolume += shortlists[1].second->Length; + sizeout = shortlists[0].second->intersect(*shortlists[1].second, out); + for (uint32_t i = 2; (sizeout > 0) && (i < shortlists.size()); ++i) { + unpackVolume += shortlists[i].first; + sizeout = shortlists[i].second->intersect(out, sizeout, out); + } + } + size_t pos = 0; + for(uint32_t i = 0; (sizeout>0) && (i < bitmaps.size()); ++i) { + unpackVolume+=bitmaps[i].first; + shared_ptr & ba = bitmaps[i].second; + pos = 0; + for(uint32_t i = 0; i < sizeout; ++i) { + if(!ba->get(out[i])) + continue; + else + out[pos++] = out[i]; + } + sizeout = pos; + } + return unpackVolume; + } + } + + ~SkippingHybM2() {} + + /** + * Estimate of the volume of data used by this 
object. + */ + size_t storageInBytes() const { + size_t answer = 0; + for(auto i : bitmapmap) + answer += i.second->sizeInBytes(); + for(auto i : shortlistmap) + answer += i.second->storageInBytes(); + return answer; + } + + +private: + + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * as a bitmap. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsBitmap(const uint32_t postid, const uint32_t * data, const uint32_t length) { + if(bitmapmap.find(postid)!=bitmapmap.end()) return 0; + BoolArray * ba = new BoolArray(mMaxId); + for(uint32_t k = 0; k < length; ++k) + ba->set(data[k]); + bitmapmap[postid] = shared_ptr(ba); + mapuncompsizes[postid] = length; + return ba->sizeInBytes()/sizeof(uint32_t); + } + + /** + * Load an array (data) of length "length" as the posting list corresponding to id postid + * as a short array. + * + * Do not call this directly, call load() instead. + */ + size_t loadAsShortArray(const uint32_t postid, const uint32_t * data, const uint32_t length) { + if(shortlistmap.find(postid)!=shortlistmap.end()) return 0; + + Skipping * compressedbuffer = new Skipping(BlockSizeLog,data,length); + shortlistmap[postid]=shared_ptr(compressedbuffer); + return compressedbuffer->storageInBytes()/sizeof(uint32_t); + } + + map > bitmapmap; + map > shortlistmap; + map mapuncompsizes; + + const size_t mMaxId; //max value that can be stored in a list + const size_t threshold;//// 32 seems to be the recommended setting, no need to change it? + uint32_t BlockSizeLog; + + + +}; +#endif /* HYBM2_H_ */ diff --git a/include/integratedbitpacking.h b/include/integratedbitpacking.h new file mode 100644 index 0000000..269714e --- /dev/null +++ b/include/integratedbitpacking.h @@ -0,0 +1,81 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef INTEGRATEDBITPACKING +#define INTEGRATEDBITPACKING +#include + + +void __integratedfastunpack0(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack1(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack2(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack3(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack4(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack5(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack6(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack7(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack8(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack9(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack10(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack11(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack12(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack13(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack14(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack15(const uint32_t 
initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack16(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack17(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack18(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack19(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack20(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack21(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack22(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack23(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack24(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack25(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack26(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack27(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack28(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack29(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack30(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastunpack31(const uint32_t initoffset, const uint32_t * __restrict__ in, 
uint32_t * __restrict__ out); +void __integratedfastunpack32(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); + + +void __integratedfastpack0(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack1(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack2(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack3(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack4(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack5(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack6(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack7(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack8(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack9(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack10(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack11(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack12(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack13(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack14(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack15(const uint32_t 
initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack16(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack17(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack18(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack19(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack20(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack21(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack22(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack23(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack24(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack25(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack26(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack27(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack28(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack29(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack30(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); +void __integratedfastpack31(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); 
+void __integratedfastpack32(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out); + +#endif // INTEGRATEDBITPACKING diff --git a/include/intersection.h b/include/intersection.h new file mode 100644 index 0000000..d6b5ae4 --- /dev/null +++ b/include/intersection.h @@ -0,0 +1,94 @@ + + +#ifndef INTERSECTION_H_ +#define INTERSECTION_H_ + +#include + +using namespace std; +/* + * Given two arrays, this writes the intersection to out. Returns the + * cardinality of the intersection. + */ +typedef size_t (*intersectionfunction)(const uint32_t * set1, + const size_t length1, const uint32_t * set2, const size_t length2, uint32_t * out); + + +/* + * Given two arrays, this writes the intersection to out. Returns the + * cardinality of the intersection. + * + * This is a mix of very fast vectorized intersection algorithms, several + * designed by N. Kurz, with adaptations by D. Lemire. + */ +size_t SIMDintersection(const uint32_t * set1, + const size_t length1, const uint32_t * set2, const size_t length2, uint32_t *out); + + +/* + * Given two arrays, this writes the intersection to out. Returns the + * cardinality of the intersection. + * + * This is a well-written, but otherwise unsophisticated function. + * Written by N. Kurz. + */ +size_t nate_scalar(const uint32_t *set1, const size_t length1, + const uint32_t *set2, const size_t length2, uint32_t *out); + +/* + * Given two arrays, this writes the intersection to out. Returns the + * cardinality of the intersection. + * + * This applies a state-of-the-art algorithm. First coded by O. Kaser, adapted + * by D. Lemire. 
+ */ +size_t onesidedgallopingintersection(const uint32_t * smallset, + const size_t smalllength, const uint32_t * largeset, + const size_t largelength, uint32_t * out) ; + + + + + + +class IntersectionFactory { +public: + static std::map intersection_schemes; + + static vector allNames() { + vector < string > ans; + for (auto i = intersection_schemes.begin(); i != intersection_schemes.end(); ++i) { + ans.push_back(i->first); + } + return ans; + } + + static string getName(intersectionfunction v) { + for(auto i = intersection_schemes.begin(); i != intersection_schemes.end() ; ++i) { + if(i->second == v ) + return i->first; + } + return "UNKNOWN"; + } + + static bool valid(string name) { + return (intersection_schemes.find(name) != intersection_schemes.end()) ; + } + + static intersectionfunction getFromName(string name) { + if (intersection_schemes.find(name) == intersection_schemes.end()) { + cerr << "name " << name << " does not refer to an intersection procedure." << endl; + cerr << "possible choices:" << endl; + for (auto i = intersection_schemes.begin(); i != intersection_schemes.end(); ++i) { + cerr << static_cast (i->first) << endl;// useless cast, but just to be clear + } + return NULL; + } + return intersection_schemes[name]; + } + +}; + + + +#endif /* INTERSECTION_H_ */ diff --git a/include/mersenne.h b/include/mersenne.h new file mode 100644 index 0000000..30e8778 --- /dev/null +++ b/include/mersenne.h @@ -0,0 +1,98 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + */ + +#ifndef MERSENNE_H_ +#define MERSENNE_H_ + +#include "common.h" +#include "util.h" + +/** + * Mersenne twister - random number generator. + * Generate uniform distribution of 32 bit integers with the MT19937 algorithm. 
+ * source: http://bannister.us/weblog/?s=Mersenne + */ +class ZRandom { + +public: + enum { + N = 624, M = 397 + }; + unsigned int MT[N + 1]; + unsigned int* map[N]; + int nValues; + + ZRandom(unsigned int iSeed = 20070102); + void seed(unsigned iSeed); + unsigned int getValue(); + unsigned int getValue(const uint32_t MaxValue); + double getDouble(); + bool test(const double p); + +}; + +ZRandom::ZRandom(unsigned iSeed) : + nValues(0) { + seed(iSeed); +} + +void ZRandom::seed(unsigned iSeed) { + nValues = 0; + // Seed the array used in random number generation. + MT[0] = iSeed; + for (int i = 1; i < N; ++i) { + MT[i] = 1 + (69069 * MT[i - 1]); + } + // Compute map once to avoid % in inner loop. + for (int i = 0; i < N; ++i) { + map[i] = MT + ((i + M) % N); + } +} + +inline bool ZRandom::test(const double p) { + return getDouble() <= p; +} +inline double ZRandom::getDouble() { + return double(getValue()) * (1.0 / 4294967296.0); +} + +unsigned int ZRandom::getValue(const uint32_t MaxValue) { + unsigned int used = MaxValue; + used |= used >> 1; + used |= used >> 2; + used |= used >> 4; + used |= used >> 8; + used |= used >> 16; + + // Draw numbers until one is found in [0,n] + unsigned int i; + do + i = getValue() & used; // toss unused bits to shorten search + while (i > MaxValue); + return i; +} + +unsigned int ZRandom::getValue() { + if (0 == nValues) { + MT[N] = MT[0]; + for (int i = 0; i < N; ++i) { + register unsigned y = (0x80000000 & MT[i]) | (0x7FFFFFFF + & MT[i + 1]); + register unsigned v = *(map[i]) ^ (y >> 1); + if (1 & y) + v ^= 2567483615; + MT[i] = v; + } + nValues = N; + } + register unsigned y = MT[N - nValues--]; + y ^= y >> 11; + y ^= static_cast((y << 7) & 2636928640); + y ^= static_cast((y << 15) & 4022730752); + y ^= y >> 18; + return y; +} + +#endif /* MERSENNE_H_ */ diff --git a/include/simdbinarypacking.h b/include/simdbinarypacking.h new file mode 100644 index 0000000..7e42b66 --- /dev/null +++ b/include/simdbinarypacking.h @@ -0,0 +1,234 
@@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#ifndef SIMDBINARYPACKING_H_ +#define SIMDBINARYPACKING_H_ + +#include "codecs.h" +#include "simdbitpackinghelpers.h" +#include "util.h" + + +template +struct SIMDBlockPacker { + typedef SIMDDeltaProcessor DeltaProcessor; + static void unpackblock(const uint32_t * in, uint32_t * out, const uint32_t bit, __m128i & initoffset ) { + if(ArrayDispatch) + ArrayDispatch::SIMDunpack(reinterpret_cast(in),out,bit); + else + simdunpack(reinterpret_cast(in),out,bit); + if(bit<32) { + initoffset = DeltaProcessor::runPrefixSum(initoffset,out); + } else { + initoffset = _mm_load_si128(reinterpret_cast<__m128i*>(out+SIMDBlockSize - 4)); + } + } + + static uint32_t maxbits(const uint32_t * in, __m128i & initoffset) { + const __m128i* pin = reinterpret_cast(in); + __m128i newvec = _mm_load_si128(pin); + __m128i accumulator = DeltaHelper::Delta(newvec , initoffset); + __m128i oldvec = newvec; + for(uint32_t k = 1; 4*k < SIMDBlockSize; ++k) { + newvec = _mm_load_si128(pin+k); + accumulator = _mm_or_si128(accumulator,DeltaHelper::Delta(newvec , oldvec)); + oldvec = newvec; + } + initoffset = oldvec; + return maxbitas32int(accumulator); + } + + static void packblockwithoutmask(uint32_t * in, uint32_t * out, const uint32_t bit, __m128i & initoffset ) { + __m128i nextoffset = _mm_load_si128(reinterpret_cast<__m128i*>(in+SIMDBlockSize - 4)); + if(bit<32) + DeltaProcessor::runDelta(initoffset,in); + if(ArrayDispatch) + ArrayDispatch::SIMDpackwithoutmask(in,reinterpret_cast<__m128i*>(out),bit); + else + simdpackwithoutmask(in,reinterpret_cast<__m128i*>(out),bit); + initoffset = nextoffset; + } + + static string name() { + if(ArrayDispatch) + return string("SIMDBlockPackerAD+") + DeltaHelper::name(); + else + return string("SIMDBlockPacker+") + DeltaHelper::name(); + } + +}; + + +template +struct SIMDIntegratedBlockPacker { + + 
static void unpackblock(const uint32_t * in, uint32_t * out, const uint32_t bit, __m128i & initoffset ) { + if(ArrayDispatch) + initoffset =IntegratedArrayDispatch::SIMDiunpack(initoffset,reinterpret_cast(in),out,bit); + else + initoffset =SIMDiunpack(initoffset,reinterpret_cast(in),out,bit); + } + + + static uint32_t maxbits(const uint32_t * in, __m128i & initoffset) { + const __m128i* pin = reinterpret_cast(in); + __m128i newvec = _mm_load_si128(pin); + __m128i accumulator = DeltaHelper::Delta(newvec , initoffset); + __m128i oldvec = newvec; + for(uint32_t k = 1; 4*k < SIMDBlockSize; ++k) { + newvec = _mm_load_si128(pin+k); + accumulator = _mm_or_si128(accumulator,DeltaHelper::Delta(newvec , oldvec)); + oldvec = newvec; + } + initoffset = oldvec; + return maxbitas32int(accumulator); + } + + static void packblockwithoutmask(uint32_t * in, uint32_t * out, const uint32_t bit, __m128i & initoffset ) { + __m128i nextoffset = _mm_load_si128(reinterpret_cast<__m128i*>(in+SIMDBlockSize - 4)); + if(ArrayDispatch) + IntegratedArrayDispatch::SIMDipackwithoutmask(initoffset,in,reinterpret_cast<__m128i*>(out),bit); + else + SIMDipackwithoutmask(initoffset,in,reinterpret_cast<__m128i*>(out),bit); + initoffset = nextoffset; + } + + static string name() { + if(ArrayDispatch) + return string("SIMDIntegratedBlockPackerAD+") + DeltaHelper::name(); + else + return string("SIMDIntegratedBlockPacker+") + DeltaHelper::name(); + } + +}; + +/** + * + * + * Code data in miniblocks of 128 integers. + * To preserve alignment, we regroup + * 8 such miniblocks into a block of 8 * 128 = 1024 + * integers. 
+ * + */ +template +class SIMDBinaryPacking: public IntegerCODEC { +public: + static const uint32_t CookiePadder = 123456;// just some made up number + static const uint32_t MiniBlockSize = 128; + static const uint32_t HowManyMiniBlocks = 16; + static const uint32_t BlockSize = MiniBlockSize;//HowManyMiniBlocks * MiniBlockSize; + + + + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + checkifdivisibleby(length, BlockSize); + const uint32_t * const initout(out); + if(needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) throw std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + *out++ = static_cast(length); + while(needPaddingTo128Bits(out)) *out++ = CookiePadder; + uint32_t Bs[HowManyMiniBlocks]; + __m128i init = _mm_set1_epi32(0); + const uint32_t * const final = in + length; + for (; in + HowManyMiniBlocks * MiniBlockSize + <= final; in += HowManyMiniBlocks * MiniBlockSize) { + __m128i tmpinit = init; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize,tmpinit); + } + *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) + | Bs[3]; + *out++ = (Bs[4] << 24) | (Bs[5] << 16) | (Bs[6] << 8) + | Bs[7]; + *out++ = (Bs[8] << 24) | (Bs[9] << 16) | (Bs[10] << 8) + | Bs[11]; + *out++ = (Bs[12] << 24) | (Bs[13] << 16) | (Bs[14] << 8) + | Bs[15]; + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i],init); + out += MiniBlockSize/32 * Bs[i]; + } + } + if(in < final) { + const size_t howmany = ( final - in ) /MiniBlockSize; + __m128i tmpinit = init; + memset(&Bs[0],0,HowManyMiniBlocks*sizeof(uint32_t)); + for (uint32_t i = 0; i < howmany; ++i) { + Bs[i] = BlockPacker::maxbits(in + i * MiniBlockSize,tmpinit); + } + *out++ = (Bs[0] << 24) | (Bs[1] << 16) | (Bs[2] << 8) + | Bs[3]; + *out++ = (Bs[4] << 24) | (Bs[5] << 16) | (Bs[6] << 8) + | Bs[7]; + *out++ = (Bs[8] << 24) | (Bs[9] 
<< 16) | (Bs[10] << 8) + | Bs[11]; + *out++ = (Bs[12] << 24) | (Bs[13] << 16) | (Bs[14] << 8) + | Bs[15]; + for (uint32_t i = 0; i < howmany; ++i) { + BlockPacker::packblockwithoutmask(in + i * MiniBlockSize, out, Bs[i],init); + out += MiniBlockSize/32 * Bs[i]; + } + in += howmany * MiniBlockSize; + assert(in == final); + } + nvalue = out - initout; + } + + const uint32_t * decodeArray(const uint32_t *in, const size_t /*length*/, + uint32_t *out, size_t & nvalue) { + if(needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) throw std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + const uint32_t actuallength = *in++; + while(needPaddingTo128Bits(in)) { + if(in[0] != CookiePadder) throw logic_error("SIMDBinaryPacking alignment issue."); + ++in; + } + const uint32_t * const initout(out); + uint32_t Bs[HowManyMiniBlocks]; + __m128i init = _mm_set1_epi32(0); + for (; out < initout + actuallength/(HowManyMiniBlocks * MiniBlockSize) *HowManyMiniBlocks * MiniBlockSize ; + out += HowManyMiniBlocks * MiniBlockSize) { + for(uint32_t i = 0; i < 4 ; ++i,++in) { + Bs[0 + 4 * i] = static_cast(in[0] >> 24); + Bs[1 + 4 * i] = static_cast(in[0] >> 16); + Bs[2 + 4 * i] = static_cast(in[0] >> 8); + Bs[3 + 4 * i] = static_cast(in[0]); + } + for (uint32_t i = 0; i < HowManyMiniBlocks; ++i) { + BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i],init); + in += MiniBlockSize/32 * Bs[i]; + } + } + + if(out < initout + actuallength) { + const size_t howmany = ( initout + actuallength - out ) /MiniBlockSize; + for(uint32_t i = 0; i < 4 ; ++i,++in) { + Bs[0 + 4 * i] = static_cast(in[0] >> 24); + Bs[1 + 4 * i] = static_cast(in[0] >> 16); + Bs[2 + 4 * i] = static_cast(in[0] >> 8); + Bs[3 + 4 * i] = static_cast(in[0]); + } + for (uint32_t i = 0; i < howmany; ++i) { + BlockPacker::unpackblock(in, out + i * MiniBlockSize, Bs[i],init); + in += MiniBlockSize/32 * Bs[i]; + } + out += howmany*MiniBlockSize; + assert(out == initout + actuallength); + } + 
nvalue = out - initout; + return in; + } + + string name() const { + ostringstream convert; + convert << "SIMDBinaryPacking"<<"With"< +struct SIMDBitPackingHelpers { + + static void pack(uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + if (SIMDBlockSize % 32) { + throw std::logic_error("Incorrect SIMDBlockSize."); + } + __m128i initoffset = _mm_set1_epi32(0); + + for(size_t k = 0; k < Qty/SIMDBlockSize; ++k) { + __m128i nextoffset = _mm_load_si128(reinterpret_cast<__m128i*>((in+k*SIMDBlockSize+SIMDBlockSize - 4))); + + if(bit<32) SIMDDeltaProcessor::runDelta(initoffset,in+k*SIMDBlockSize); + simdpack(in+k*SIMDBlockSize,reinterpret_cast<__m128i*>(out+SIMDBlockSize*k*bit/32),bit); + initoffset = nextoffset; + } + } + + + static void unpack(const uint32_t * in, size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + __m128i initoffset = _mm_set1_epi32(0); + + for(size_t k = 0; k < Qty/SIMDBlockSize; ++k) { + simdunpack(reinterpret_cast(in+SIMDBlockSize*k*bit/32),out+k*SIMDBlockSize,bit); + if(bit<32) { + initoffset = SIMDDeltaProcessor::runPrefixSum(initoffset,out+k*SIMDBlockSize); + } + } + } + + static void packwithoutmask(uint32_t * in, const size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + __m128i initoffset = _mm_set1_epi32(0); + + for(size_t k = 0; k < Qty/SIMDBlockSize; ++k) { + __m128i nextoffset = _mm_load_si128(reinterpret_cast<__m128i*>((in+k*SIMDBlockSize+SIMDBlockSize - 4))); + if(bit<32) SIMDDeltaProcessor::runDelta(initoffset,in+k*SIMDBlockSize); + 
simdpackwithoutmask(in+k*SIMDBlockSize,reinterpret_cast<__m128i*>(out+SIMDBlockSize*k*bit/32),bit); + initoffset = nextoffset; + } + } + + static void ipack(const uint32_t * in, const size_t Qty, uint32_t * _out, const uint32_t bit ) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + __m128i *out = reinterpret_cast<__m128i*>(_out); + __m128i initoffset = _mm_set1_epi32(0U);; + + for(size_t k = 0; k < Qty/SIMDBlockSize; ++k) { + SIMDipack(initoffset,in+k*SIMDBlockSize,out+k*bit,bit); + initoffset = _mm_load_si128 (reinterpret_cast(in+k*SIMDBlockSize+SIMDBlockSize - 4)); + //memcpy(&initoffset, (in+k*SIMDBlockSize+SIMDBlockSize - 4), sizeof initoffset);// Daniel: memcpy looks like a hack + } + } + + static void ipackwithoutmask(const uint32_t * in, const size_t Qty, uint32_t * _out, const uint32_t bit ) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + __m128i *out = reinterpret_cast<__m128i*>(_out); + __m128i initoffset = _mm_set1_epi32(0U);; + + for(size_t k = 0; k < Qty/SIMDBlockSize; ++k) { + SIMDipackwithoutmask(initoffset,in+k*SIMDBlockSize,out+k*bit,bit); + initoffset = _mm_load_si128 (reinterpret_cast(in+k*SIMDBlockSize+SIMDBlockSize - 4)); + //memcpy(&initoffset, (in+k*SIMDBlockSize+SIMDBlockSize - 4), sizeof initoffset);// Daniel: memcpy looks like a hack + } + } + + static void iunpack(const uint32_t * _in, size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % SIMDBlockSize) { + throw std::logic_error("Incorrect # of entries."); + } + const __m128i *in = reinterpret_cast(_in); + + __m128i initoffset = _mm_set1_epi32(0U);; + + for(size_t k = 0; k < Qty/SIMDBlockSize; ++k) { + initoffset = SIMDiunpack(initoffset,in+k*bit,out+k*SIMDBlockSize,bit); + } + } + + // this is not expected to be useful, only for benchmarking + static void ipatchedunpack(const uint32_t * _in, size_t Qty, uint32_t * out, const uint32_t bit ) { + if (Qty % SIMDBlockSize) { + throw 
std::logic_error("Incorrect # of entries."); + } + const __m128i *in = reinterpret_cast(_in); + + __m128i initoffset = _mm_set1_epi32(0U);; + + for(size_t k = 0; k < Qty/SIMDBlockSize; ++k) { + initoffset = SIMDipatchedunpack(initoffset,in+k*bit,out+k*SIMDBlockSize,reinterpret_cast(out+k*SIMDBlockSize),bit); + } + } + + + static void CheckMaxDiff(const std::vector& refdata, unsigned bit) { + for(size_t i = 4; i < refdata.size(); ++i ) { + if(gccbits(refdata[i]-refdata[i-4])>bit) throw std::runtime_error("bug"); + } + } +}; + +#endif diff --git a/include/simdfastpfor.h b/include/simdfastpfor.h new file mode 100644 index 0000000..2fe515e --- /dev/null +++ b/include/simdfastpfor.h @@ -0,0 +1,476 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef SIMDFASTPFOR_H_ +#define SIMDFASTPFOR_H_ + +#include "common.h" +#include "codecs.h" +#include "sortedbitpacking.h" +#include "simdbitpacking.h" +#include "util.h" +#include "delta.h" + + + +/** + * SIMDFastPFor + * + * Reference and documentation: + * + * Daniel Lemire and Leonid Boytsov, Decoding billions of integers per second through vectorization + * http://arxiv.org/abs/1209.2137 + * + * Note that this implementation is slightly improved compared to the version presented + * in the paper. + * + * Designed by D. Lemire with ideas from Leonid Boytsov. This scheme is NOT patented. + * + */ +template , bool arraydispatch=true > +class SIMDFastPFor: public IntegerCODEC { +public: + /** + * ps (page size) should be a multiple of BlockSize, any "large" + * value should do. 
+ */ + SIMDFastPFor(uint32_t ps = 65536) : + PageSize(ps), bitsPageSize(gccbits(PageSize)), bpacker(), + bytescontainer(PageSize + 3 * PageSize / BlockSize) { + assert(ps / BlockSize * BlockSize == ps); + assert(gccbits(BlockSizeInUnitsOfPackSize * PACKSIZE - 1) <= 8); + } + enum { + BlockSizeInUnitsOfPackSize = 4, + PACKSIZE = 32, + overheadofeachexcept = 8, + overheadduetobits = 8, + overheadduetonmbrexcept = 8, + BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE + }; + + + + const uint32_t PageSize; + const uint32_t bitsPageSize; + SortedBitPacker bpacker; + vector bytescontainer; + + const uint32_t * decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + if(needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) throw std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + const uint32_t * const initin(in); + const size_t mynvalue = *in; + ++in; + if (mynvalue > nvalue) + throw NotEnoughStorage(mynvalue); + nvalue = mynvalue; + const uint32_t * const finalout(out + nvalue); + __m128i prev = _mm_set1_epi32 (0); + while (out != finalout) { + size_t thisnvalue(0); + size_t thissize = + static_cast (finalout > PageSize + out ? PageSize + : (finalout - out)); + + __decodeArray(in, thisnvalue, out, thissize,prev); + in += thisnvalue; + out += thissize; + } + assert(initin + length >= in); + bpacker.reset();// if you don't do this, the codec has a "memory". + return in; + } + + /** + * If you save the output and recover it in memory, you are + * responsible to ensure that the alignment is preserved. + * + * The input size (length) should be a multiple of + * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done + * to simplify slightly the implementation.) 
+ */ + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + if(needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) throw std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + checkifdivisibleby(length, BlockSize); + const uint32_t * const initout(out); + const uint32_t * const finalin(in + length); + + *out++ = static_cast(length); + const size_t oldnvalue = nvalue; + nvalue = 1; + __m128i prev = _mm_set1_epi32 (0); + while (in != finalin) { + size_t thissize = + static_cast (finalin > PageSize + in ? PageSize + : (finalin - in)); + size_t thisnvalue(0); + __encodeArray(in, thissize, out, thisnvalue,prev); + nvalue += thisnvalue; + out += thisnvalue; + in += thissize; + } + assert(out == nvalue + initout); + if (oldnvalue < nvalue) + cerr << "It is possible we have a buffer overrun. " << endl; + bpacker.reset();// if you don't do this, the buffer has a memory + } + + + void getBestBFromData(const uint32_t * in, uint8_t& bestb, + uint8_t & bestcexcept, uint8_t & maxb) { + uint32_t freqs[33]; + for (uint32_t k = 0; k <= 32; ++k) + freqs[k] = 0; + for (uint32_t k = 0; k < BlockSize; ++k) { + freqs[asmbits(in[k])]++; + } + bestb = 32; + while (freqs[bestb] == 0) + bestb--; + maxb = bestb; + uint32_t bestcost = bestb * BlockSize; + uint32_t cexcept = 0; + bestcexcept = static_cast(cexcept); + for (uint32_t b = bestb - 1; b < 32; --b) { + cexcept += freqs[b + 1]; + uint32_t thiscost = cexcept * overheadofeachexcept + cexcept + * (maxb - b) + b * BlockSize + 8;// the extra 8 is the cost of storing maxbits + if (thiscost < bestcost) { + bestcost = thiscost; + bestb = static_cast(b); + bestcexcept = static_cast(cexcept); + } + } + } + + void __encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t & nvalue, __m128i & prev) {// = _mm_set1_epi32 (0);// for delta + uint32_t * const initout = out; // keep track of this + checkifdivisibleby(length, BlockSize); + uint32_t * const headerout 
= out++; // keep track of this + bpacker.clear(); + uint8_t * bc = bytescontainer.data(); + out = padTo128bits(out); + if(needPaddingTo128Bits(in)) throw std::runtime_error("alignment bug"); + for (const uint32_t * const final = in + length; (in + BlockSize + <= final); in += BlockSize) { + uint8_t bestb, bestcexcept, maxb; + + const __m128i nextprev = _mm_load_si128 (reinterpret_cast(in+BlockSize-4)); + SIMDDeltaProcessor::runDelta(prev,in); + prev = nextprev; + + getBestBFromData(in, bestb, bestcexcept, maxb); + *bc++ = bestb; + *bc++ = bestcexcept; + if (bestcexcept > 0) { + *bc++ = maxb; + bpacker.ensureCapacity(maxb - bestb - 1,bestcexcept); + const uint32_t maxval = 1U << bestb; + for (uint32_t k = 0; k < BlockSize; ++k) { + if (in[k] >= maxval) { + bpacker.directAppend(maxb - bestb - 1, in[k] >> bestb); + *bc++ = static_cast(k); + } + } + } + simdpack(in, reinterpret_cast<__m128i *>(out), bestb); + out += 4 * bestb; + } + headerout[0] = static_cast (out - headerout); + const uint32_t bytescontainersize = static_cast(bc - bytescontainer.data()); + *(out++) = bytescontainersize; + memcpy(out, bytescontainer.data(), bytescontainersize); + out += (bytescontainersize + sizeof(uint32_t) - 1) + / sizeof(uint32_t); + const uint32_t * const lastout = bpacker.write(out); + nvalue = lastout - initout; + } + + void __decodeArray(const uint32_t *in, size_t & length, uint32_t *out, + const size_t nvalue, __m128i & prev) { + const uint32_t * const initin = in; + const uint32_t * const headerin = in++; + const uint32_t wheremeta = headerin[0]; + const uint32_t *inexcept = headerin + wheremeta; + const uint32_t bytesize = *inexcept++; + const uint8_t * bytep = reinterpret_cast (inexcept); + + inexcept += (bytesize + sizeof(uint32_t) - 1) / sizeof(uint32_t); + inexcept = bpacker.read(inexcept); + length = inexcept - initin; + const uint32_t * unpackpointers[32 + 1]; + for (uint32_t k = 1; k <= 32; ++k) { + unpackpointers[k] = bpacker.get(k-1); + } + in = padTo128bits(in); + 
assert(!needPaddingTo128Bits(out)); + for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out + += BlockSize) { + const uint8_t b = *bytep++; + const uint8_t cexcept = *bytep++; + if(arraydispatch) + simdunpack(reinterpret_cast(in), out, b); + else + ArrayDispatch::SIMDunpack(reinterpret_cast(in), out, b); + in += 4*b; + if (cexcept > 0) { + const uint8_t maxbits = *bytep++; + const uint32_t * vals = unpackpointers[maxbits - b]; + unpackpointers[maxbits - b] += cexcept; + for (uint32_t k = 0; k < cexcept; ++k) { + const uint8_t pos = *(bytep++); + out[pos] |= vals[k] << b; + } + } + prev = SIMDDeltaProcessor::runPrefixSum(prev,out); + + } + + assert(in == headerin + wheremeta); + } + + string name() const { + return string("SIMDFastPFor")+DeltaHelper::name()+SortedBitPacker::name(); + } + +}; + + + + + +/** + * This version uses integrated differential coding + */ +template > +class SIMDIntegratedFastPFor: public IntegerCODEC { +public: + /** + * ps (page size) should be a multiple of BlockSize, any "large" + * value should do. 
+ */ + SIMDIntegratedFastPFor(uint32_t ps = 65536) : + PageSize(ps), bitsPageSize(gccbits(PageSize)), bpacker(), + bytescontainer(PageSize + 3 * PageSize / BlockSize), patchedbuffer(BlockSize) { + assert(ps / BlockSize * BlockSize == ps); + assert(gccbits(BlockSizeInUnitsOfPackSize * PACKSIZE - 1) <= 8); + } + enum { + BlockSizeInUnitsOfPackSize = 4, + PACKSIZE = 32, + overheadofeachexcept = 8, + overheadduetobits = 8, + overheadduetonmbrexcept = 8, + BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE + }; + + + + const uint32_t PageSize; + const uint32_t bitsPageSize; + SortedBitPacker bpacker; + vector bytescontainer; + vector patchedbuffer; + + const uint32_t * decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + if(needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) throw std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + const uint32_t * const initin(in); + const size_t mynvalue = *in; + ++in; + if (mynvalue > nvalue) + throw NotEnoughStorage(mynvalue); + nvalue = mynvalue; + const uint32_t * const finalout(out + nvalue); + __m128i prev = _mm_set1_epi32 (0); + while (out != finalout) { + size_t thisnvalue(0); + size_t thissize = + static_cast (finalout > PageSize + out ? PageSize + : (finalout - out)); + + __decodeArray(in, thisnvalue, out, thissize,prev); + in += thisnvalue; + out += thissize; + } + assert(initin + length >= in); + bpacker.reset();// if you don't do this, the codec has a "memory". + return in; + } + + /** + * If you save the output and recover it in memory, you are + * responsible to ensure that the alignment is preserved. + * + * The input size (length) should be a multiple of + * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done + * to simplify slightly the implementation.) 
+ */ + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + if(needPaddingTo128Bits(out) or needPaddingTo128Bits(in)) throw std::runtime_error("alignment issue: pointers should be aligned on 128-bit boundaries"); + checkifdivisibleby(length, BlockSize); + const uint32_t * const initout(out); + const uint32_t * const finalin(in + length); + + *out++ = static_cast(length); + const size_t oldnvalue = nvalue; + __m128i prev = _mm_set1_epi32 (0); + nvalue = 1; + while (in != finalin) { + size_t thissize = + static_cast (finalin > PageSize + in ? PageSize + : (finalin - in)); + size_t thisnvalue(0); + __encodeArray(in, thissize, out, thisnvalue,prev); + nvalue += thisnvalue; + out += thisnvalue; + in += thissize; + } + assert(out == nvalue + initout); + if (oldnvalue < nvalue) + cerr << "It is possible we have a buffer overrun. " << endl; + bpacker.reset();// if you don't do this, the buffer has a memory + } + + + void getBestBFromData(const uint32_t * in, uint8_t& bestb, + uint8_t & bestcexcept, uint8_t & maxb) { + uint32_t freqs[33]; + for (uint32_t k = 0; k <= 32; ++k) + freqs[k] = 0; + for (uint32_t k = 0; k < BlockSize; ++k) { + freqs[asmbits(in[k])]++; + } + bestb = 32; + while (freqs[bestb] == 0) + bestb--; + maxb = bestb; + + uint32_t bestcost = bestb * BlockSize; + uint32_t cexcept = 0; + bestcexcept = static_cast(cexcept); + for (uint32_t b = bestb - 1; b < 32; --b) { + cexcept += freqs[b + 1]; + uint32_t thiscost = cexcept * overheadofeachexcept + cexcept + * (maxb - b) + b * BlockSize + 8;// the extra 8 is the cost of storing maxbits + if (thiscost < bestcost) { + bestcost = thiscost; + bestb = static_cast(b); + bestcexcept = static_cast(cexcept); + } + } + } + + void __encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t & nvalue, __m128i & prev) { + //__m128i prev = _mm_set1_epi32 (0);// for delta + + uint32_t * const initout = out; // keep track of this + checkifdivisibleby(length, BlockSize); + 
uint32_t * const headerout = out++; // keep track of this + bpacker.clear(); + uint8_t * bc = bytescontainer.data(); + out = padTo128bits(out); + if(needPaddingTo128Bits(in)) throw std::runtime_error("alignment bug"); + for (const uint32_t * const final = in + length; (in + BlockSize + <= final); in += BlockSize) { + uint8_t bestb, bestcexcept, maxb; + //TODO: implement integrated packing + const __m128i nextprev = _mm_load_si128 (reinterpret_cast(in+BlockSize-4)); + SIMDDeltaProcessor::runDelta(prev,in); + prev = nextprev; + getBestBFromData(in, bestb, bestcexcept, maxb); + *bc++ = bestb; + *bc++ = bestcexcept; + if (bestcexcept > 0) { + *bc++ = maxb; + bpacker.ensureCapacity(maxb - bestb - 1,bestcexcept); + const uint32_t maxval = 1U << bestb; + for (uint32_t k = 0; k < BlockSize; ++k) { + if (in[k] >= maxval) { + bpacker.directAppend(maxb - bestb - 1, in[k] >> bestb); + *bc++ = static_cast(k); + } + } + } + simdpack(in, reinterpret_cast<__m128i *>(out), bestb); + out += 4 * bestb; + } + headerout[0] = static_cast (out - headerout); + const uint32_t bytescontainersize = static_cast(bc - bytescontainer.data()); + *(out++) = bytescontainersize; + memcpy(out, bytescontainer.data(), bytescontainersize); + out += (bytescontainersize + sizeof(uint32_t) - 1) + / sizeof(uint32_t); + const uint32_t * const lastout = bpacker.write(out); + nvalue = lastout - initout; + } + + void __decodeArray(const uint32_t *in, size_t & length, uint32_t *out, + const size_t nvalue, __m128i & prev) { + //__m128i prev = _mm_set1_epi32 (0);// for delta + + const uint32_t * const initin = in; + const uint32_t * const headerin = in++; + const uint32_t wheremeta = headerin[0]; + const uint32_t *inexcept = headerin + wheremeta; + const uint32_t bytesize = *inexcept++; + const uint8_t * bytep = reinterpret_cast (inexcept); + + inexcept += (bytesize + sizeof(uint32_t) - 1) / sizeof(uint32_t); + inexcept = bpacker.read(inexcept); + length = inexcept - initin; + const uint32_t * unpackpointers[32 + 
1]; + for (uint32_t k = 1; k <= 32; ++k) { + unpackpointers[k] = bpacker.get(k-1); + } + in = padTo128bits(in); + assert(!needPaddingTo128Bits(out)); + assert(!needPaddingTo128Bits(patchedbuffer.data())); + + for (uint32_t run = 0; run < nvalue / BlockSize; ++run, out + += BlockSize) { + const uint32_t b = *bytep++; + const uint32_t cexcept = *bytep++; + if (cexcept > 0) { + const uint8_t maxbits = *bytep++; + const uint32_t * vals = unpackpointers[maxbits - b]; + unpackpointers[maxbits - b] += cexcept; + patchedbuffer[bytep[0]] + = vals[0]<::SIMDipatchedunpack(prev, + reinterpret_cast (in), out, + reinterpret_cast (patchedbuffer.data()), b); + in += 4 * b; + if (cexcept > 0) { + patchedbuffer[bytep[0]] = 0; + for (uint32_t k = 1; k < cexcept; ++k) { + patchedbuffer[bytep[k]] = 0; + } + bytep += cexcept; + } + } + + assert(in == headerin + wheremeta); + } + + string name() const { + return string("SIMDIntegratedFastPFor")+DeltaHelper::name()+SortedBitPacker::name(); + } + +}; + + +#endif /* SIMDFASTPFOR_H_ */ diff --git a/include/simdintegratedbitpacking.h b/include/simdintegratedbitpacking.h new file mode 100644 index 0000000..9cf37c8 --- /dev/null +++ b/include/simdintegratedbitpacking.h @@ -0,0 +1,695 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Leonid Boytsov, Nathan Kurz and Daniel Lemire + */ + +#ifndef SIMD_INTEGRATED_BITPACKING_H +#define SIMD_INTEGRATED_BITPACKING_H + + +/** + * To avoid crazy dependencies, this header should not + * include any other header beside delta.h. 
+ */ +#include "deltatemplates.h" + + + +template +__m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack0(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack0(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack1(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack1(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack1(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack2(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack2(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack3(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack3(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack4(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack4(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack5(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack5(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack6(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void 
ipack6(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack7(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack7(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack7(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack8(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack8(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack9(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack9(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack9(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack10(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack10(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack11(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack11(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack12(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack12(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack13(__m128i, const __m128i *, uint32_t *); +template 
+__m128i ipatchedunpack13(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack13(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack14(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack14(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack15(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack15(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack16(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack16(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack17(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack17(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack18(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack18(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack19(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack19(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack19(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask19(__m128i, 
const uint32_t *, __m128i *); + + +template +__m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack20(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack20(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack21(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack21(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack22(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack22(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack23(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack23(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack23(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack24(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack24(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack25(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack25(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack26(__m128i, const __m128i *, uint32_t *, const __m128i *); 
+template +void ipack26(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack27(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack27(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack28(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack28(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack28(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack29(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack29(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack30(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack30(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack31(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack31(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); + + +template +__m128i iunpack32(__m128i, const __m128i *, uint32_t *); +template +__m128i ipatchedunpack32(__m128i, const __m128i *, uint32_t *, const __m128i *); +template +void ipack32(__m128i, const uint32_t *, __m128i *); +template +void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); + + +typedef __m128i 
(*integratedunpackingfunction)(__m128i, const __m128i *, uint32_t *); +typedef __m128i (*integratedpatchedunpackingfunction)(__m128i, const __m128i *, uint32_t *, const __m128i *); + +typedef void (*integratedpackingfunction)(__m128i, const uint32_t *, __m128i *); + + +template +struct IntegratedArrayDispatch { + static integratedunpackingfunction unpack[33]; + + static inline __m128i SIMDiunpack(__m128i initOffset, const __m128i * in, uint32_t * out, const uint32_t bit) { + return unpack[bit](initOffset,in,out); + } + static integratedpatchedunpackingfunction patchedunpack[33]; + + static inline __m128i SIMDipatchedunpack(__m128i initOffset, const __m128i * in, uint32_t * out, const __m128i * patchedbuffer, const uint32_t bit) { + return patchedunpack[bit](initOffset,in,out,patchedbuffer); + } + static integratedpackingfunction packwithoutmask[33]; + + static inline void SIMDipackwithoutmask(__m128i initOffset, const uint32_t * in, __m128i * out, const uint32_t bit) { + packwithoutmask[bit](initOffset,in,out); + } + static integratedpackingfunction pack[33]; + + static inline void SIMDipack(__m128i initOffset, const uint32_t * in, __m128i * out, const uint32_t bit) { + pack[bit](initOffset,in,out); + } +}; + +template +integratedunpackingfunction IntegratedArrayDispatch::unpack[33] ={iunpack0,iunpack1,iunpack2,iunpack3,iunpack4,iunpack5,iunpack6,iunpack7,iunpack8,iunpack9,iunpack10,iunpack11,iunpack12,iunpack13,iunpack14,iunpack15,iunpack16,iunpack17,iunpack18,iunpack19,iunpack20,iunpack21,iunpack22,iunpack23,iunpack24,iunpack25,iunpack26,iunpack27,iunpack28,iunpack29,iunpack30,iunpack31,iunpack32}; + +template +integratedpatchedunpackingfunction IntegratedArrayDispatch::patchedunpack[33] 
={ipatchedunpack0,ipatchedunpack1,ipatchedunpack2,ipatchedunpack3,ipatchedunpack4,ipatchedunpack5,ipatchedunpack6,ipatchedunpack7,ipatchedunpack8,ipatchedunpack9,ipatchedunpack10,ipatchedunpack11,ipatchedunpack12,ipatchedunpack13,ipatchedunpack14,ipatchedunpack15,ipatchedunpack16,ipatchedunpack17,ipatchedunpack18,ipatchedunpack19,ipatchedunpack20,ipatchedunpack21,ipatchedunpack22,ipatchedunpack23,ipatchedunpack24,ipatchedunpack25,ipatchedunpack26,ipatchedunpack27,ipatchedunpack28,ipatchedunpack29,ipatchedunpack30,ipatchedunpack31,ipatchedunpack32}; + +template +integratedpackingfunction IntegratedArrayDispatch::packwithoutmask[33] = {ipackwithoutmask0,ipackwithoutmask1,ipackwithoutmask2,ipackwithoutmask3,ipackwithoutmask4,ipackwithoutmask5,ipackwithoutmask6,ipackwithoutmask7,ipackwithoutmask8,ipackwithoutmask9,ipackwithoutmask10,ipackwithoutmask11,ipackwithoutmask12,ipackwithoutmask13,ipackwithoutmask14,ipackwithoutmask15,ipackwithoutmask16,ipackwithoutmask17,ipackwithoutmask18,ipackwithoutmask19,ipackwithoutmask20,ipackwithoutmask21,ipackwithoutmask22,ipackwithoutmask23,ipackwithoutmask24,ipackwithoutmask25,ipackwithoutmask26,ipackwithoutmask27,ipackwithoutmask28,ipackwithoutmask29,ipackwithoutmask30,ipackwithoutmask31,ipackwithoutmask32}; + +template +integratedpackingfunction IntegratedArrayDispatch::pack[33] ={ipack0,ipack1,ipack2,ipack3,ipack4,ipack5,ipack6,ipack7,ipack8,ipack9,ipack10,ipack11,ipack12,ipack13,ipack14,ipack15,ipack16,ipack17,ipack18,ipack19,ipack20,ipack21,ipack22,ipack23,ipack24,ipack25,ipack26,ipack27,ipack28,ipack29,ipack30,ipack31,ipack32}; + + + +template +inline __m128i SIMDiunpack(__m128i initOffset, const __m128i * in, uint32_t * out, const uint32_t bit) { + switch(bit) { + case 0: return iunpack0(initOffset,in,out); + + case 1: return iunpack1(initOffset,in,out); + + case 2: return iunpack2(initOffset,in,out); + + case 3: return iunpack3(initOffset,in,out); + + case 4: return iunpack4(initOffset,in,out); + + case 5: return 
iunpack5(initOffset,in,out); + + case 6: return iunpack6(initOffset,in,out); + + case 7: return iunpack7(initOffset,in,out); + + case 8: return iunpack8(initOffset,in,out); + + case 9: return iunpack9(initOffset,in,out); + + case 10: return iunpack10(initOffset,in,out); + + case 11: return iunpack11(initOffset,in,out); + + case 12: return iunpack12(initOffset,in,out); + + case 13: return iunpack13(initOffset,in,out); + + case 14: return iunpack14(initOffset,in,out); + + case 15: return iunpack15(initOffset,in,out); + + case 16: return iunpack16(initOffset,in,out); + + case 17: return iunpack17(initOffset,in,out); + + case 18: return iunpack18(initOffset,in,out); + + case 19: return iunpack19(initOffset,in,out); + + case 20: return iunpack20(initOffset,in,out); + + case 21: return iunpack21(initOffset,in,out); + + case 22: return iunpack22(initOffset,in,out); + + case 23: return iunpack23(initOffset,in,out); + + case 24: return iunpack24(initOffset,in,out); + + case 25: return iunpack25(initOffset,in,out); + + case 26: return iunpack26(initOffset,in,out); + + case 27: return iunpack27(initOffset,in,out); + + case 28: return iunpack28(initOffset,in,out); + + case 29: return iunpack29(initOffset,in,out); + + case 30: return iunpack30(initOffset,in,out); + + case 31: return iunpack31(initOffset,in,out); + + case 32: return iunpack32(initOffset,in,out); + + default: break; + } + throw std::logic_error("number of bits is unsupported"); +} + + +template +inline __m128i SIMDipatchedunpack(__m128i initOffset, const __m128i * in, uint32_t * out, const __m128i * patchedbuffer, const uint32_t bit) { + switch(bit) { + case 0: return ipatchedunpack0(initOffset,in,out,patchedbuffer); + + case 1: return ipatchedunpack1(initOffset,in,out,patchedbuffer); + + case 2: return ipatchedunpack2(initOffset,in,out,patchedbuffer); + + case 3: return ipatchedunpack3(initOffset,in,out,patchedbuffer); + + case 4: return ipatchedunpack4(initOffset,in,out,patchedbuffer); + + case 5: return 
ipatchedunpack5(initOffset,in,out,patchedbuffer); + + case 6: return ipatchedunpack6(initOffset,in,out,patchedbuffer); + + case 7: return ipatchedunpack7(initOffset,in,out,patchedbuffer); + + case 8: return ipatchedunpack8(initOffset,in,out,patchedbuffer); + + case 9: return ipatchedunpack9(initOffset,in,out,patchedbuffer); + + case 10: return ipatchedunpack10(initOffset,in,out,patchedbuffer); + + case 11: return ipatchedunpack11(initOffset,in,out,patchedbuffer); + + case 12: return ipatchedunpack12(initOffset,in,out,patchedbuffer); + + case 13: return ipatchedunpack13(initOffset,in,out,patchedbuffer); + + case 14: return ipatchedunpack14(initOffset,in,out,patchedbuffer); + + case 15: return ipatchedunpack15(initOffset,in,out,patchedbuffer); + + case 16: return ipatchedunpack16(initOffset,in,out,patchedbuffer); + + case 17: return ipatchedunpack17(initOffset,in,out,patchedbuffer); + + case 18: return ipatchedunpack18(initOffset,in,out,patchedbuffer); + + case 19: return ipatchedunpack19(initOffset,in,out,patchedbuffer); + + case 20: return ipatchedunpack20(initOffset,in,out,patchedbuffer); + + case 21: return ipatchedunpack21(initOffset,in,out,patchedbuffer); + + case 22: return ipatchedunpack22(initOffset,in,out,patchedbuffer); + + case 23: return ipatchedunpack23(initOffset,in,out,patchedbuffer); + + case 24: return ipatchedunpack24(initOffset,in,out,patchedbuffer); + + case 25: return ipatchedunpack25(initOffset,in,out,patchedbuffer); + + case 26: return ipatchedunpack26(initOffset,in,out,patchedbuffer); + + case 27: return ipatchedunpack27(initOffset,in,out,patchedbuffer); + + case 28: return ipatchedunpack28(initOffset,in,out,patchedbuffer); + + case 29: return ipatchedunpack29(initOffset,in,out,patchedbuffer); + + case 30: return ipatchedunpack30(initOffset,in,out,patchedbuffer); + + case 31: return ipatchedunpack31(initOffset,in,out,patchedbuffer); + + case 32: return ipatchedunpack32(initOffset,in,out,patchedbuffer); + + default: break; + } + throw 
std::logic_error("number of bits is unsupported"); +} + + + /*assumes that integers fit in the prescribed number of bits*/ +template +void SIMDipackwithoutmask(__m128i initOffset, const uint32_t * in, __m128i * out, const uint32_t bit) { + switch(bit) { + case 0: return; + + case 1: ipackwithoutmask1(initOffset,in,out); return; + + case 2: ipackwithoutmask2(initOffset,in,out); return; + + case 3: ipackwithoutmask3(initOffset,in,out); return; + + case 4: ipackwithoutmask4(initOffset,in,out); return; + + case 5: ipackwithoutmask5(initOffset,in,out); return; + + case 6: ipackwithoutmask6(initOffset,in,out); return; + + case 7: ipackwithoutmask7(initOffset,in,out); return; + + case 8: ipackwithoutmask8(initOffset,in,out); return; + + case 9: ipackwithoutmask9(initOffset,in,out); return; + + case 10: ipackwithoutmask10(initOffset,in,out); return; + + case 11: ipackwithoutmask11(initOffset,in,out); return; + + case 12: ipackwithoutmask12(initOffset,in,out); return; + + case 13: ipackwithoutmask13(initOffset,in,out); return; + + case 14: ipackwithoutmask14(initOffset,in,out); return; + + case 15: ipackwithoutmask15(initOffset,in,out); return; + + case 16: ipackwithoutmask16(initOffset,in,out); return; + + case 17: ipackwithoutmask17(initOffset,in,out); return; + + case 18: ipackwithoutmask18(initOffset,in,out); return; + + case 19: ipackwithoutmask19(initOffset,in,out); return; + + case 20: ipackwithoutmask20(initOffset,in,out); return; + + case 21: ipackwithoutmask21(initOffset,in,out); return; + + case 22: ipackwithoutmask22(initOffset,in,out); return; + + case 23: ipackwithoutmask23(initOffset,in,out); return; + + case 24: ipackwithoutmask24(initOffset,in,out); return; + + case 25: ipackwithoutmask25(initOffset,in,out); return; + + case 26: ipackwithoutmask26(initOffset,in,out); return; + + case 27: ipackwithoutmask27(initOffset,in,out); return; + + case 28: ipackwithoutmask28(initOffset,in,out); return; + + case 29: ipackwithoutmask29(initOffset,in,out); return; + + 
case 30: ipackwithoutmask30(initOffset,in,out); return; + + case 31: ipackwithoutmask31(initOffset,in,out); return; + + case 32: ipackwithoutmask32(initOffset,in,out); return; + + default: break; + } + throw std::logic_error("number of bits is unsupported"); +} + + + +template +void SIMDipack(__m128i initOffset, const uint32_t * in, __m128i * out, const uint32_t bit) { + switch(bit) { + case 0: return; + + case 1: ipack1(initOffset, in,out); return; + + case 2: ipack2(initOffset, in,out); return; + + case 3: ipack3(initOffset, in,out); return; + + case 4: ipack4(initOffset, in,out); return; + + case 5: ipack5(initOffset, in,out); return; + + case 6: ipack6(initOffset, in,out); return; + + case 7: ipack7(initOffset, in,out); return; + + case 8: ipack8(initOffset, in,out); return; + + case 9: ipack9(initOffset, in,out); return; + + case 10: ipack10(initOffset, in,out); return; + + case 11: ipack11(initOffset, in,out); return; + + case 12: ipack12(initOffset, in,out); return; + + case 13: ipack13(initOffset, in,out); return; + + case 14: ipack14(initOffset, in,out); return; + + case 15: ipack15(initOffset, in,out); return; + + case 16: ipack16(initOffset, in,out); return; + + case 17: ipack17(initOffset, in,out); return; + + case 18: ipack18(initOffset, in,out); return; + + case 19: ipack19(initOffset, in,out); return; + + case 20: ipack20(initOffset, in,out); return; + + case 21: ipack21(initOffset, in,out); return; + + case 22: ipack22(initOffset, in,out); return; + + case 23: ipack23(initOffset, in,out); return; + + case 24: ipack24(initOffset, in,out); return; + + case 25: ipack25(initOffset, in,out); return; + + case 26: ipack26(initOffset, in,out); return; + + case 27: ipack27(initOffset, in,out); return; + + case 28: ipack28(initOffset, in,out); return; + + case 29: ipack29(initOffset, in,out); return; + + case 30: ipack30(initOffset, in,out); return; + + case 31: ipack31(initOffset, in,out); return; + + case 32: ipack32(initOffset, in,out); return; + + 
/*
 * This is a simple implementation of the Skipping data structure and
 * algorithms similar to what is described in
 *
 *   Sanders and Transier, Intersection in Integer Inverted Indices, 2007.
 *
 * As suggested in their conclusion, we leave the higher-level structure
 * uncompressed.  We also use differential coding.
 *
 * Sanders and Transier's proposal is similar in spirit to the skipping
 * structure proposed in
 *
 *   Moffat, A., Zobel, J.: Self-indexing inverted files for fast text
 *   retrieval.  ACM Transactions on Information Systems 14 (1996).
 *
 * Author: Daniel Lemire
 */

#ifndef SKIPPING_H_
#define SKIPPING_H_

// NOTE(review): this header originally pulled in "common.h"; the standard
// headers below are all it actually needs, which keeps it self-contained.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <utility>
#include <vector>

class Skipping {
public:
  // BS is the log2 of the block size; it must lie strictly between 0 and 32.
  // data must be sorted in increasing order (values are differentially
  // coded, so a decreasing step would wrap around).
  Skipping(uint32_t BS, const uint32_t *data, uint32_t length)
      : BlockSizeLog(BS), mainbuffer(), highbuffer(), Length(0) {
    // BUG FIX: the original guard used "&&", which can never be true, so
    // invalid block sizes were silently accepted.
    if ((BlockSizeLog == 0) || (BlockSizeLog >= 32))
      throw std::runtime_error("please use a reasonable BlockSizeLog");
    load(data, length); // cheap constructor
  }

  ~Skipping() {}

  // Rough storage estimate in bytes (good enough).
  size_t storageInBytes() const {
    return mainbuffer.size() * sizeof(uint8_t) +
           highbuffer.size() * sizeof(higharraypair) + sizeof(Length);
  }

  // Decodes every stored value into out; returns the count written (Length).
  uint32_t decompress(uint32_t *out) const {
    const uint8_t *bout = mainbuffer.data();
    uint32_t pos = 0;
    uint32_t val = 0;
    for (uint32_t k = 0; k < Length; ++k) {
      bout = decode(bout, val);
      out[pos++] = val;
    }
    return pos;
  }

  /**
   * Intersects the current Skipping structure with a (small) uncompressed
   * sorted array and writes the answer to out.  Returns the answer size.
   */
  uint32_t intersect(const uint32_t *smallarray, uint32_t length,
                     uint32_t *out) const {
    if (Length == 0)
      return 0; // robustness: empty structure has an empty highbuffer
    uint32_t intersectsize = 0;
    const uint8_t *largemainpointer = mainbuffer.data();
    uint32_t largemainval = 0;
    largemainpointer = decode(largemainpointer, largemainval);
    uint32_t x = 0;
    for (uint32_t k = 0; k < length; ++k) {
      uint32_t val = smallarray[k];
      // if the last value of the current block is too small, skip the block
      if (highbuffer[x >> BlockSizeLog].first < val) {
        do {
          x = ((x >> BlockSizeLog) + 1) << BlockSizeLog;
          if (x >= Length) {
            return intersectsize;
          }
        } while (highbuffer[x >> BlockSizeLog].first < val);
        // restart decoding at the found block: seed the running value with
        // the last value of the previous block, then decode its first delta
        largemainpointer =
            mainbuffer.data() + highbuffer[x >> BlockSizeLog].second;
        largemainval = highbuffer[(x >> BlockSizeLog) - 1].first;
        largemainpointer = decode(largemainpointer, largemainval);
      }
      // at this point the last value of the current block is >= val, so we
      // decode at most one block before giving up
      while (largemainval < val) {
        ++x;
        if (x >= Length) {
          return intersectsize;
        }
        largemainpointer = decode(largemainpointer, largemainval);
      }
      if (largemainval == val) {
        out[intersectsize++] = val;
      }
    }
    return intersectsize;
  }

  // Intersects two Skipping structures; "this" should be the smaller one
  // (the roles are swapped automatically otherwise).
  uint32_t intersect(const Skipping &otherlarger, uint32_t *out) const {
    // we assume that "this" is the smallest of the two
    if (otherlarger.Length < Length)
      return otherlarger.intersect(*this, out);
    if (Length == 0)
      return 0; // special silly case
    assert(otherlarger.Length >= Length);
    assert(otherlarger.Length > 0);
    uint32_t intersectsize = 0;

    const uint8_t *inbyte = mainbuffer.data();
    const uint8_t *const endbyte = mainbuffer.data() + mainbuffer.size();
    const uint8_t *largemainpointer = otherlarger.mainbuffer.data();
    uint32_t largemainval = 0;
    largemainpointer = decode(largemainpointer, largemainval);
    uint32_t val = 0; // where I put decoded values
    uint32_t x = 0;
    while (endbyte > inbyte) {
      inbyte = decode(inbyte, val);
      // if the last value of the current block is too small, skip the block
      if (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val) {
        do {
          x = ((x >> otherlarger.BlockSizeLog) + 1)
              << otherlarger.BlockSizeLog;
          if (x >= otherlarger.Length) {
            return intersectsize;
          }
        } while (
            otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val);
        largemainpointer =
            otherlarger.mainbuffer.data() +
            otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].second;
        largemainval =
            otherlarger.highbuffer[(x >> otherlarger.BlockSizeLog) - 1].first;
        largemainpointer = decode(largemainpointer, largemainval);
      }
      // the last value of the current block is >= val: decode at most one
      // block before giving up
      while (largemainval < val) {
        ++x;
        if (x >= otherlarger.Length) {
          return intersectsize;
        }
        largemainpointer = decode(largemainpointer, largemainval);
      }
      if (largemainval == val) {
        out[intersectsize++] = val;
      }
    }
    return intersectsize;
  }

  uint32_t BlockSizeLog;
  std::vector<uint8_t> mainbuffer; // VByte-coded deltas, block after block
  // per block: (last value of the block, byte offset of the block in
  // mainbuffer)
  typedef std::pair<uint32_t, uint32_t> higharraypair;

  typedef std::vector<higharraypair> higharray;
  higharray highbuffer;
  uint32_t Length;

  // please don't use the default constructor...
  Skipping() : BlockSizeLog(0), mainbuffer(), highbuffer(), Length(0) {}

private:
  // making it private on purpose: non-copyable
  Skipping(const Skipping &);
  Skipping &operator=(const Skipping &);

  void load(const uint32_t *data, uint32_t length);

  // byte i (bits 7i..7i+6) of val
  template <uint32_t i>
  uint8_t extract7bits(const uint32_t val) {
    return static_cast<uint8_t>((val >> (7 * i)) & ((1U << 7) - 1));
  }

  // like extract7bits, but without masking off the high bits (last byte)
  template <uint32_t i>
  uint8_t extract7bitsmaskless(const uint32_t val) {
    return static_cast<uint8_t>(val >> (7 * i));
  }

  // VByte-decodes one delta from buffer and adds it to prev; returns the
  // advanced pointer.  Convention: the HIGH bit marks the LAST byte.
  static inline const uint8_t *decode(const uint8_t *buffer, uint32_t &prev) {
    // manually unrolled for performance
    uint32_t v = 0;
    uint8_t c = *buffer++;
    v += (c & 127);
    if ((c & 128)) {
      prev += v;
      return buffer;
    }
    c = *buffer++;
    v += ((c & 127) << 7);
    if ((c & 128)) {
      prev += v;
      return buffer;
    }
    c = *buffer++;
    v += ((c & 127) << 14);
    if ((c & 128)) {
      prev += v;
      return buffer;
    }
    c = *buffer++;
    v += ((c & 127) << 21);
    if ((c & 128)) {
      prev += v;
      return buffer;
    }
    c = *buffer++;
    // BUG FIX: this shift was 30; the fifth byte stores bits 28..31 (the
    // encoder writes extract7bitsmaskless<4>(val) == val >> 28), so values
    // of 2^28 and larger were corrupted on decode.
    v += (static_cast<uint32_t>(c & 127) << 28);
    prev += v;
    return buffer;
  }
};

// Builds highbuffer and the VByte-coded mainbuffer from sorted input.
void Skipping::load(const uint32_t *data, uint32_t len) {
  assert(len <
         (std::numeric_limits<uint32_t>::max() / 5)); // check for overflow
  Length = len;
  if (Length == 0)
    return; // nothing to do
  // number of blocks, rounding up
  uint32_t BlockNumber = (Length + (1U << BlockSizeLog) - 1) >> BlockSizeLog;
  assert((BlockNumber << BlockSizeLog) >= Length);
  highbuffer.resize(BlockNumber);
  mainbuffer.resize(5 * Length); // worst case: 5 bytes per value
  uint8_t *bout = mainbuffer.data();
  uint8_t *const boutinit = bout;
  uint32_t prev = 0;
  for (uint32_t k = 0; k < BlockNumber; ++k) {
    const uint32_t howmany = (((k + 1) << BlockSizeLog) > Length)
                                 ? Length - (k << BlockSizeLog)
                                 : 1U << BlockSizeLog;
    highbuffer[k] = std::make_pair(data[(k << BlockSizeLog) + howmany - 1],
                                   static_cast<uint32_t>(bout - boutinit));
    for (uint32_t x = 0; x < howmany; ++x) {
      const uint32_t v = data[x + (k << BlockSizeLog)];
      const uint32_t val = v - prev; // differential coding
      prev = v;
      if (val < (1U << 7)) {
        *bout = static_cast<uint8_t>(val | (1U << 7));
        ++bout;
      } else if (val < (1U << 14)) {
        *bout = extract7bits<0>(val);
        ++bout;
        *bout = static_cast<uint8_t>(extract7bitsmaskless<1>(val) | (1U << 7));
        ++bout;
      } else if (val < (1U << 21)) {
        *bout = extract7bits<0>(val);
        ++bout;
        *bout = extract7bits<1>(val);
        ++bout;
        *bout = static_cast<uint8_t>(extract7bitsmaskless<2>(val) | (1U << 7));
        ++bout;
      } else if (val < (1U << 28)) {
        *bout = extract7bits<0>(val);
        ++bout;
        *bout = extract7bits<1>(val);
        ++bout;
        *bout = extract7bits<2>(val);
        ++bout;
        *bout = static_cast<uint8_t>(extract7bitsmaskless<3>(val) | (1U << 7));
        ++bout;
      } else {
        *bout = extract7bits<0>(val);
        ++bout;
        *bout = extract7bits<1>(val);
        ++bout;
        *bout = extract7bits<2>(val);
        ++bout;
        *bout = extract7bits<3>(val);
        ++bout;
        *bout = static_cast<uint8_t>(extract7bitsmaskless<4>(val) | (1U << 7));
        ++bout;
      }
    }
  }
  mainbuffer.resize(static_cast<uint32_t>(bout - boutinit));
  mainbuffer.shrink_to_fit();
}

#endif /* SKIPPING_H_ */
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#ifndef SORTEDBITPACKING_H_ +#define SORTEDBITPACKING_H_ + + +#include "common.h" +#include "simdbitpacking.h" +#include "bitpackinghelpers.h" + + +template +__attribute__ ((const)) +static T * padTo128bits(T * inbyte) { + return reinterpret_cast ((reinterpret_cast (inbyte) + + 15) & ~15); +} + +/** + * This is a minimalist class that allows you to store data + * in one of 32 "stores". Each store is for + * integers having bit width 1, 2..., 32 respectively. + * + * Design by D. Lemire + */ +template +class BasicSortedBitPacker { +public: + + enum{DEFAULTSIZE = 128};// should be a multiple of 128 + + static string name() { + return forcealign ? "BSBP":"uBSBP"; + } + + BasicSortedBitPacker() { + for(uint32_t i = 0; i < 32;++i) { + data[i] = new uint32_t[DEFAULTSIZE]; + memset(data[i],0,DEFAULTSIZE*sizeof(uint32_t)); + actualsizes[i] = DEFAULTSIZE; + } + clear(); + } + + void reset() { + for(uint32_t i = 0; i < 32;++i) { + delete[] data[i]; + data[i] = new uint32_t[DEFAULTSIZE]; + memset(data[i],0,DEFAULTSIZE*sizeof(uint32_t)); + actualsizes[i] = DEFAULTSIZE; + } + clear(); + } + + ~BasicSortedBitPacker() { + free(); + } + void free() { + clear(); + for(uint32_t i = 0; i < 32;++i) + if( data[i] != NULL) { + delete[] data[i]; + data[i] = NULL; + actualsizes[i] = 0; + } + } + void directAppend(uint32_t i, uint32_t val) { + data[i][sizes[i]++] = val; + } + + const uint32_t * get(int i) { + return data[i]; + } + + void ensureCapacity(int i, uint32_t datatoadd) { + if(sizes[i]+datatoadd>actualsizes[i]) { + actualsizes[i] = (sizes[i]+datatoadd+127)/128*128*2;// so we always get a multiple of 128 + uint32_t * tmp = new uint32_t[actualsizes[i]]; + for(uint32_t j = 0; j< sizes[i];++j) + tmp[j]= data[i][j]; + delete[] data[i]; + data[i] = tmp; + } + } + + void clear() { + for(uint32_t i = 0; i < 32;++i) + sizes[i] = 0;// memset "might" be faster. + } + + /** + * The data structure can be slightly inefficient sometimes. 
This + * reports the number of "padded" bits (wasted bits) used. + */ + uint32_t computeWaste() { + uint32_t answer = 0; + for (uint32_t k = 0; k < 32; ++k) { + if (sizes[k] != 0) { + uint32_t j = sizes[k]/128*128; + if(j(out), k+1); + out += 4 * (k+1); + } else { + usimdpackwithoutmask(&data[k][j], reinterpret_cast<__m128i *>(out), k+1); + out += 4 * (k+1); + } + } + if(j(out), k+1); + out += 4 * (k+1); + } else { + // falling back on scalar + for(;j(in), &data[k][j], k+1); + in += 4 * (k+1); + } else { + usimdunpack(reinterpret_cast(in), &data[k][j], k+1); + in += 4 * (k+1); + } + } + if(j(in), &data[k][j], k+1); + in += 4 * (k+1); + } else { + // falling back on scalar + for(;jactualsizes[k]) { + cerr<<"overflow at "<k+1) { + cerr<<"At "<<(k+1)<<" we have "<0) { + cout< generateArray(uint32_t N, const uint32_t mask = 0xFFFFFFFFU) { + vector < uint32_t> ans(N); + for (size_t k = 0; k < N; ++k) + ans[k] = rand() & mask; + return ans; +} + +vector generateArray32(uint32_t N, const uint32_t mask = 0xFFFFFFFFU) { + vector < uint32_t> ans(N); + for (size_t k = 0; k < N; ++k) + ans[k] = rand() & mask; + return ans; +} + + +class UniformDataGenerator { +public: + UniformDataGenerator(uint32_t seed = static_cast(time(NULL))) : + rand(seed) { + } + + void negate(vector & in , vector & out, uint32_t Max) { + out.resize(Max-in.size()); + in.push_back(Max); + uint32_t i = 0; + size_t c = 0; + for(size_t j = 0; j < in.size() ; ++j) { + const uint32_t v = in[j]; + for(; i generateUniformHash(uint32_t N, uint32_t Max, vector < uint32_t > & ans) { + if(Max < N) throw runtime_error("can't generate enough distinct elements in small interval"); + ans.clear(); + if(N==0) return ans; // nothing to do + ans.reserve(N); + assert(Max >= 1); + unordered_set s; + while (s.size() < N ) + s.insert(rand.getValue(Max - 1) ); + ans.assign(s.begin(), s.end()); + sort(ans.begin(),ans.end()); + assert(N == ans.size()); + return ans; + } + + + void generateUniformBitmap(uint32_t N, uint32_t Max, 
vector & ans) { + if(Max < N) throw runtime_error("can't generate enough distinct elements in small interval"); + assert(Max >= 1); + BoolArray bs(Max); + uint32_t card = 0; + while(card < N) { + uint32_t v = rand.getValue(Max - 1) ; + if(!bs.get(v)) { + bs.set(v); + ++card; + } + } + ans.resize(N); + bs.toArray(ans); + } + + + void fastgenerateUniform(uint32_t N, uint32_t Max, vector & ans) { + if (2*N > Max) { + vector buf(N); + fastgenerateUniform(Max-N,Max,buf); + negate(buf,ans,Max); + return; + } + if(N*1024 > Max) { + generateUniformBitmap(N,Max,ans); + } + generateUniformHash(N,Max,ans); + } + + + + // Max value is excluded from range + vector generate(uint32_t N, uint32_t Max) { + vector ans; + ans.reserve(N); + fastgenerateUniform(N,Max,ans); + return ans; + } + ZRandom rand; + +}; + + +class ClusteredDataGenerator { +public: + + vector buffer; + UniformDataGenerator unidg; + ClusteredDataGenerator(uint32_t seed = static_cast(time(NULL))) : + buffer(), unidg(seed) { + } + + // Max value is excluded from range + template + void fillUniform(iterator begin, iterator end, uint32_t Min, uint32_t Max) { + unidg.fastgenerateUniform(static_cast(end - begin), Max - Min,buffer); + for (size_t k = 0; k < buffer.size(); ++k) + *(begin + k) = Min + buffer[k]; + } + + + // Max value is excluded from range + // throws exception if impossible + template + void fillClustered(iterator begin, iterator end, uint32_t Min, uint32_t Max) { + const uint32_t N = static_cast(end - begin); + const uint32_t range = Max - Min; + if(range < N) throw runtime_error("can't generate that many in small interval."); + assert(range >= N); + if ((range == N) or (N < 10)) { + fillUniform(begin, end, Min, Max); + return; + } + const uint32_t cut = N / 2 + unidg.rand.getValue( range - N ); + assert(cut >= N / 2); + assert(Max - Min - cut >= N - N / 2); + const double p = unidg.rand.getDouble(); + assert(p <= 1); + assert(p >= 0); + if (p <= 0.25) { + fillUniform(begin, begin + N / 2, Min, Min + 
cut); + fillClustered(begin + N / 2, end, Min + cut, Max); + } else if (p <= 0.5) { + fillClustered(begin, begin + N / 2, Min, Min + cut); + fillUniform(begin + N / 2, end, Min + cut, Max); + } else { + fillClustered(begin, begin + N / 2, Min, Min + cut); + fillClustered(begin + N / 2, end, Min + cut, Max); + } + } + + // Max value is excluded from range + vector generate(uint32_t N, uint32_t Max) { + return generateClustered(N,Max); + } + + // Max value is excluded from range + vector generateClustered(uint32_t N, uint32_t Max) { + vector < uint32_t > ans(N); + fillClustered(ans.begin(), ans.end(), 0, Max); + return ans; + } + +}; + +class ZipfianGenerator { +public: + + uint32_t n; + double zetan, theta; + vector proba; + + ZRandom rand; + ZipfianGenerator(uint32_t seed = static_cast(time(NULL))) : + n(0), zetan(0), theta(0), proba(n), rand(seed) { + } + + void init(int _items, double _zipfianconstant = 1.0) { + n = _items; + if(_items == 0) throw runtime_error("no items?"); + theta = _zipfianconstant; + if (theta > 0) { + zetan = 1 / zeta(n, theta); + proba.clear(); + proba.resize(n, 0); + proba[0] = zetan; + for (uint32_t i = 1; i < n; ++i) + proba[i] = proba[i - 1] + zetan / pow(i + 1, theta); + } else { + proba.resize(n, 1.0 / n); + } + } + + void seed(uint32_t s) { + rand.seed(s); + } + + ZipfianGenerator(int _items, double _zipfianconstant, + uint32_t seed = static_cast(time(NULL))) : + n(_items), zetan(0), theta(_zipfianconstant), proba(n), rand(seed) { + init(_items, _zipfianconstant); + } + + double zeta(int n, double theta) { + double sum = 0; + for (long i = 0; i < n; i++) { + sum += 1 / (pow(i + 1, theta)); + } + return sum; + } + int nextInt() { + // Map z to the value + const double u = rand.getDouble(); + return static_cast(lower_bound(proba.begin(), proba.end(), u) - proba.begin()); + } + +}; + +vector generateZipfianArray32(uint32_t N, double power, + const uint32_t mask = 0xFFFFFFFFU) { + vector ans(N); + ZipfianGenerator zipf; + const uint32_t 
// NOTE(review): the tail of generateZipfianArray32 that sat here belongs to
// the corrupted span above; the function is reproduced whole there.

/**
 * Union of two sorted uint32 arrays, written to out (which must have room
 * for length1 + length2 values).  Returns the number of values written.
 */
size_t unite(const uint32_t *set1, const size_t length1, const uint32_t *set2,
             const size_t length2, uint32_t *out) {
  size_t pos = 0;
  size_t k1 = 0, k2 = 0;
  if (0 == length1) {
    for (size_t k = 0; k < length2; ++k)
      out[k] = set2[k];
    return length2;
  }
  if (0 == length2) {
    for (size_t k = 0; k < length1; ++k)
      out[k] = set1[k];
    return length1;
  }
  while (true) {
    if (set1[k1] < set2[k2]) {
      out[pos++] = set1[k1];
      ++k1;
      if (k1 >= length1) {
        for (; k2 < length2; ++k2)
          out[pos++] = set2[k2];
        break;
      }
    } else if (set1[k1] == set2[k2]) {
      out[pos++] = set1[k1]; // common value emitted once
      ++k1;
      ++k2;
      if (k1 >= length1) {
        for (; k2 < length2; ++k2)
          out[pos++] = set2[k2];
        break;
      }
      if (k2 >= length2) {
        for (; k1 < length1; ++k1)
          out[pos++] = set1[k1];
        break;
      }
    } else { // set1[k1] > set2[k2]
      out[pos++] = set2[k2];
      ++k2;
      if (k2 >= length2) {
        for (; k1 < length1; ++k1)
          out[pos++] = set1[k1];
        break;
      }
    }
  }
  return pos;
}

// Convenience wrapper over the pointer version.
std::vector<uint32_t> unite(const std::vector<uint32_t> &x,
                            const std::vector<uint32_t> &y) {
  std::vector<uint32_t> ans(x.size() + y.size());
  ans.resize(unite(x.data(), x.size(), y.data(), y.size(), ans.data()));
  return ans;
}

/**
 * Intersection of two sorted uint32 arrays, written to out.
 * Returns the number of values written.
 */
size_t classicalintersection(const uint32_t *set1, const size_t length1,
                             const uint32_t *set2, const size_t length2,
                             uint32_t *out) {
  if ((0 == length1) || (0 == length2))
    return 0;
  size_t answer = 0;
  size_t k1 = 0, k2 = 0;
  while (true) {
    if (set1[k1] < set2[k2]) {
      ++k1;
      if (k1 == length1)
        return answer;
    } else if (set2[k2] < set1[k1]) {
      ++k2;
      if (k2 == length2)
        return answer;
    } else {
      // (set2[k2] == set1[k1])
      out[answer++] = set1[k1];
      ++k1;
      if (k1 == length1)
        break;
      ++k2;
      if (k2 == length2)
        break;
    }
  }
  return answer;
}

// Convenience wrapper over the pointer version.
std::vector<uint32_t> intersect(const std::vector<uint32_t> &x,
                                const std::vector<uint32_t> &y) {
  // an intersection can never exceed the smaller input
  // (the original over-allocated x.size() + y.size())
  std::vector<uint32_t> ans(std::min(x.size(), y.size()));
  ans.resize(
      classicalintersection(x.data(), x.size(), y.data(), y.size(),
                            ans.data()));
  return ans;
}

/**
 * Generate a pair of sorted arrays.  One small, one larger.
 *
 * minlength: length of the smallest of the two arrays
 * Max is the largest possible value
 * sizeratio * minlength : length of the largest of the two arrays
 * intersectionratio * minlength : length of the intersection
 *
 * Throws if the requested ratios cannot be achieved (within 5%).
 */
template <class generator>
std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
getPair(generator gen, uint32_t minlength, uint32_t Max, float sizeratio,
        float intersectionratio) {
  if (sizeratio < 1)
    throw std::runtime_error("sizeratio should be larger or equal to 1");
  if (intersectionratio < 0)
    throw std::runtime_error("intersectionratio should be positive");
  if (intersectionratio > 1)
    throw std::runtime_error("intersectionratio cannot be larger than 1");
  const uint32_t maxlenth = static_cast<uint32_t>(
      std::round(static_cast<float>(minlength) * sizeratio));
  if (maxlenth > Max)
    throw std::runtime_error(
        "I can't generate an array so large in such a small range.");
  if (maxlenth < minlength)
    throw std::runtime_error("something went wrong, possibly an overflow.");
  // we basically assume that, if we do nothing, intersections are very small
  const uint32_t intersize = static_cast<uint32_t>(
      std::round(static_cast<float>(minlength) * intersectionratio));

  std::vector<uint32_t> inter = gen.generate(intersize, Max);
  std::vector<uint32_t> smallest = unite(
      gen.generate(static_cast<uint32_t>(minlength - inter.size()), Max),
      inter);
  std::vector<uint32_t> largest = unite(
      gen.generate(static_cast<uint32_t>(maxlenth - inter.size()), Max),
      inter);
  std::vector<uint32_t> intersection = intersect(smallest, largest);

  if (std::abs(static_cast<double>(intersection.size()) /
                   static_cast<double>(smallest.size()) -
               intersectionratio) > 0.05)
    throw std::runtime_error("Bad intersection ratio. Fix me.");

  if (std::abs(static_cast<double>(largest.size()) /
                   static_cast<double>(smallest.size()) -
               sizeratio) > 0.05)
    throw std::runtime_error("Bad size ratio. Fix me.");
  return std::pair<std::vector<uint32_t>, std::vector<uint32_t>>(smallest,
                                                                 largest);
}

// NOTE(review): the original "#endif /* SYNTHETIC_H_ */" is omitted here
// because its matching "#ifndef" was lost in the corrupted splice above;
// restore both together.

#ifndef TIMER_H_
#define TIMER_H_

#include <chrono>
#include <cstdint>
#ifndef _WIN32
#include <sys/resource.h>
#endif

// Wall-clock stopwatch reporting elapsed microseconds between reset()/split().
class WallClockTimer {
public:
  // BUG FIX: the original referenced an undefined "qpc_clock" under _WIN32;
  // std::chrono::high_resolution_clock is portable (and QPC-backed on
  // Windows), so we use it unconditionally.
  typedef std::chrono::high_resolution_clock clock;

  std::chrono::time_point<clock> t1, t2;

  WallClockTimer() : t1(), t2() {
    t1 = clock::now();
    t2 = t1;
  }
  void reset() {
    t1 = clock::now();
    t2 = t1;
  }
  // microseconds between the last reset() and the last split()
  uint64_t elapsed() {
    std::chrono::microseconds delta =
        std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
    return delta.count();
  }
  uint64_t split() {
    t2 = clock::now();
    return elapsed();
  }
};

#ifndef _WIN32

// CPU-time stopwatch (user + system) in microseconds, based on getrusage.
class CPUTimer {
public:
  // clock_t t1, t2;
  struct rusage t1, t2;

  CPUTimer() : t1(), t2() {
    getrusage(RUSAGE_SELF, &t1);
    // t1 = clock();
    t2 = t1;
  }
  void reset() {
    getrusage(RUSAGE_SELF, &t1);
    t2 = t1;
  }
  // proxy for userelapsed
  uint64_t elapsed() { return totalelapsed(); }

  uint64_t totalelapsed() { return userelapsed() + systemelapsed(); }

  // returns the *user* CPU time in micro seconds (mu s)
  uint64_t userelapsed() {
    return ((t2.ru_utime.tv_sec - t1.ru_utime.tv_sec) * 1000ULL * 1000ULL) +
           ((t2.ru_utime.tv_usec - t1.ru_utime.tv_usec));
  }

  // returns the *system* CPU time in micro seconds (mu s)
  uint64_t systemelapsed() {
    return ((t2.ru_stime.tv_sec - t1.ru_stime.tv_sec) * 1000ULL * 1000ULL) +
           ((t2.ru_stime.tv_usec - t1.ru_stime.tv_usec));
  }

  uint64_t split() {
    getrusage(RUSAGE_SELF, &t2);
    return elapsed();
  }
};
#endif

#endif /* TIMER_H_ */
b/include/usimdbitpacking.h @@ -0,0 +1,118 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire + */ +#ifndef USIMDBITPACKING_H_ +#define USIMDBITPACKING_H_ + +#include "common.h" + + +void __uSIMD_fastunpack1(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack2(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack3(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack4(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack5(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack6(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack7(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack8(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack9(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack10(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack11(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack12(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack13(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack14(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack15(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack16(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack17(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack18(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack19(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack20(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack21(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack22(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack23(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack24(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack25(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack26(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack27(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack28(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack29(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack30(const __m128i* , uint32_t * ); 
+void __uSIMD_fastunpack31(const __m128i* , uint32_t * ); +void __uSIMD_fastunpack32(const __m128i* , uint32_t * ); + + +void __uSIMD_fastpackwithoutmask0(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask1(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask2(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask3(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask4(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask5(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask6(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask7(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask8(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask9(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask10(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask11(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask12(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask13(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask14(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask15(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask16(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask17(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask18(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask19(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask20(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask21(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask22(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask23(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask24(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask25(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask26(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask27(const uint32_t * 
, __m128i * ); +void __uSIMD_fastpackwithoutmask28(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask29(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask30(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask31(const uint32_t * , __m128i * ); +void __uSIMD_fastpackwithoutmask32(const uint32_t * , __m128i * ); + +void __uSIMD_fastpack0(const uint32_t * , __m128i * ); +void __uSIMD_fastpack1(const uint32_t * , __m128i * ); +void __uSIMD_fastpack2(const uint32_t * , __m128i * ); +void __uSIMD_fastpack3(const uint32_t * , __m128i * ); +void __uSIMD_fastpack4(const uint32_t * , __m128i * ); +void __uSIMD_fastpack5(const uint32_t * , __m128i * ); +void __uSIMD_fastpack6(const uint32_t * , __m128i * ); +void __uSIMD_fastpack7(const uint32_t * , __m128i * ); +void __uSIMD_fastpack8(const uint32_t * , __m128i * ); +void __uSIMD_fastpack9(const uint32_t * , __m128i * ); +void __uSIMD_fastpack10(const uint32_t * , __m128i * ); +void __uSIMD_fastpack11(const uint32_t * , __m128i * ); +void __uSIMD_fastpack12(const uint32_t * , __m128i * ); +void __uSIMD_fastpack13(const uint32_t * , __m128i * ); +void __uSIMD_fastpack14(const uint32_t * , __m128i * ); +void __uSIMD_fastpack15(const uint32_t * , __m128i * ); +void __uSIMD_fastpack16(const uint32_t * , __m128i * ); +void __uSIMD_fastpack17(const uint32_t * , __m128i * ); +void __uSIMD_fastpack18(const uint32_t * , __m128i * ); +void __uSIMD_fastpack19(const uint32_t * , __m128i * ); +void __uSIMD_fastpack20(const uint32_t * , __m128i * ); +void __uSIMD_fastpack21(const uint32_t * , __m128i * ); +void __uSIMD_fastpack22(const uint32_t * , __m128i * ); +void __uSIMD_fastpack23(const uint32_t * , __m128i * ); +void __uSIMD_fastpack24(const uint32_t * , __m128i * ); +void __uSIMD_fastpack25(const uint32_t * , __m128i * ); +void __uSIMD_fastpack26(const uint32_t * , __m128i * ); +void __uSIMD_fastpack27(const uint32_t * , __m128i * ); +void __uSIMD_fastpack28(const uint32_t * , __m128i * ); 
+void __uSIMD_fastpack29(const uint32_t * , __m128i * ); +void __uSIMD_fastpack30(const uint32_t * , __m128i * ); +void __uSIMD_fastpack31(const uint32_t * , __m128i * ); +void __uSIMD_fastpack32(const uint32_t * , __m128i * ); + + + + +#endif /* SIMDBITPACKING_H_ */ diff --git a/include/util.h b/include/util.h new file mode 100644 index 0000000..83ba846 --- /dev/null +++ b/include/util.h @@ -0,0 +1,129 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire + */ + +#ifndef UTIL_H_ +#define UTIL_H_ + +#include "common.h" + +inline uint32_t random(int b) { + if(b==32) return rand(); + return rand() % (1U< (tmparray), accumulator); + return gccbits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]); +} + +// for clarity +#define min(X, Y) ((X) < (Y) ? (X) : (Y)) + + +static __attribute__ ((const)) +bool divisibleby(size_t a, uint32_t x) { + return (a % x == 0); +} + +#ifdef __GNUC__ +__attribute__ ((unused)) +#endif +static void checkifdivisibleby(size_t a, uint32_t x) { + if (!divisibleby(a, x)) { + std::ostringstream convert; + convert << a << " not divisible by " << x; + throw std::logic_error(convert.str()); + } +} + + + +template +__attribute__ ((pure)) +uint32_t maxbits(const iterator & begin, const iterator & end) { + uint32_t accumulator = 0; + for (iterator k = begin; k != end; ++k) { + accumulator |= *k; + } + return gccbits(accumulator); +} + + +template +__attribute__ ((const)) +inline bool needPaddingTo128Bits(const T * inbyte) { + return reinterpret_cast (inbyte) & 15; +} + + + + +template +__attribute__ ((const)) +inline bool needPaddingTo32Bits(const T * inbyte) { + return reinterpret_cast (inbyte) & 3; +} + +template +__attribute__ ((const)) +T * padTo32bits(T * inbyte) { + return reinterpret_cast< T *> ((reinterpret_cast (inbyte) + + 3) & ~3); +} + +template +__attribute__ ((const)) +const T * padTo32bits(const T * inbyte) { + return reinterpret_cast ((reinterpret_cast 
(inbyte) + + 3) & ~3); +} + + + +__attribute__ ((const)) +inline uint32_t asmbits(const uint32_t v) { + if (v == 0) + return 0; + uint32_t answer; + __asm__("bsr %1, %0;" :"=r"(answer) :"r"(v)); + return answer + 1; +} +#endif /* UTIL_H_ */ diff --git a/include/variablebyte.h b/include/variablebyte.h new file mode 100644 index 0000000..b9a342a --- /dev/null +++ b/include/variablebyte.h @@ -0,0 +1,123 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#ifndef VARIABLEBYTE_H_ +#define VARIABLEBYTE_H_ +#include "common.h" +#include "codecs.h" +#include "util.h" + + +template +class VariableByte: public IntegerCODEC { +public: + + + void encodeArray(uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + uint8_t * bout = reinterpret_cast (out); + const uint8_t * const initbout = reinterpret_cast (out); + uint32_t prev = 0; + for (size_t k = 0; k < length; ++k) { + const uint32_t val = delta? in[k] - prev: in[k]; + if(delta) prev = in[k]; + /** + * Code below could be shorter. Whether it could be faster + * depends on your compiler and machine. 
+ */ + if (val < (1U << 7)) { + *bout = static_cast(val | (1U << 7)); + ++bout; + } else if (val < (1U << 14)) { + *bout = extract7bits<0> (val); + ++bout; + *bout = extract7bitsmaskless<1> (val) | (1U << 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = extract7bits<0> (val); + ++bout; + *bout = extract7bits<1> (val); + ++bout; + *bout = extract7bitsmaskless<2> (val) | (1U << 7); + ++bout; + } else if (val < (1U << 28)) { + *bout = extract7bits<0> (val); + ++bout; + *bout = extract7bits<1> (val); + ++bout; + *bout = extract7bits<2> (val); + ++bout; + *bout = extract7bitsmaskless<3> (val) | (1U << 7); + ++bout; + } else { + *bout = extract7bits<0> (val); + ++bout; + *bout = extract7bits<1> (val); + ++bout; + *bout = extract7bits<2> (val); + ++bout; + *bout = extract7bits<3> (val); + ++bout; + *bout = extract7bitsmaskless<4> (val) | (1U << 7); + ++bout; + } + } + while (needPaddingTo32Bits(bout)) { + *bout++ = 0; + } + const size_t storageinbytes = bout - initbout; + assert((storageinbytes % 4) == 0); + nvalue = storageinbytes / 4; + } + + const uint32_t * decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t & nvalue) { + uint32_t prev = 0; + if (length == 0) { + nvalue = 0; + return in;//abort + } + const uint8_t * inbyte = reinterpret_cast (in); + const uint8_t * const endbyte = reinterpret_cast (in + + length); + const uint32_t * const initout(out); + + while (endbyte > inbyte) { + unsigned int shift = 0; + for (uint32_t v = 0; endbyte > inbyte; shift += 7) { + uint8_t c = *inbyte++; + v += ((c & 127) << shift); + if ((c & 128)) { + *out++ = delta ? 
(prev = v + prev) : v; + break; + } + } + } + nvalue = out - initout; + inbyte = padTo32bits(inbyte); + return reinterpret_cast (inbyte); + } + + string name() const { + if(delta) + return "VariableByteDelta"; + else + return "VariableByte"; + } +private: + template + uint8_t extract7bits(const uint32_t val) { + return static_cast((val >> (7 * i)) & ((1U << 7) - 1)); + } + + template + uint8_t extract7bitsmaskless(const uint32_t val) { + return static_cast((val >> (7 * i))); + } +}; + +#endif /* VARIABLEBYTE_H_ */ diff --git a/makefile b/makefile new file mode 100644 index 0000000..8bb661b --- /dev/null +++ b/makefile @@ -0,0 +1,108 @@ +.SUFFIXES: +# +.SUFFIXES: .cpp .o .c .h +# replace the YOURCXX variable with a path to a C++11 compatible compiler. +ifeq ($(INTEL), 1) +# if you wish to use the Intel compiler, please do "make INTEL=1". + YOURCXX ?= /opt/intel/bin/icpc +ifeq ($(DEBUG),1) + CXXFLAGS = -std=c++11 -O3 -Wall -ansi -xAVX -DDEBUG=1 -D_GLIBCXX_DEBUG -ggdb +else + CXXFLAGS = -std=c++11 -O3 -Wall -ansi -xAVX -DNDEBUG=1 -ggdb +endif # debug +else #intel + YOURCXX ?= g++-4.7 +ifeq ($(DEBUG),1) + CXXFLAGS = -mavx -std=c++11 -Weffc++ -pedantic -ggdb -DDEBUG=1 -D_GLIBCXX_DEBUG -Wall -Wextra -Wcast-align +else + CXXFLAGS = -mavx -std=c++11 -Weffc++ -pedantic -O3 -Wall -Wextra -Wcast-align +endif #debug +endif #intel + +CXX := $(YOURCXX) + + + + +HEADERS= $(shell ls include/*h) + +all: unit testcodecs testintegration advancedbenchmarking benchintersection + echo "please run unit tests by running the unit executable" + +advancedbenchmarking: simplesynth compress uncompress budgetedtest entropy + +bitpacking.o: include/bitpacking.h src/bitpacking.cpp + $(CXX) $(CXXFLAGS) -c src/bitpacking.cpp -Iinclude + +intersection.o: include/intersection.h src/intersection.cpp + $(CXX) $(CXXFLAGS) -c src/intersection.cpp -Iinclude + +benchintersection: intersection.o src/benchintersection.cpp include/synthetic.h include/timer.h + $(CXX) $(CXXFLAGS) -o benchintersection 
src/benchintersection.cpp intersection.o -Iinclude + +integratedbitpacking.o: include/integratedbitpacking.h src/integratedbitpacking.cpp + $(CXX) $(CXXFLAGS) -c src/integratedbitpacking.cpp -Iinclude + + +simdbitpacking.o: include/simdbitpacking.h src/simdbitpacking.cpp + $(CXX) $(CXXFLAGS) -c src/simdbitpacking.cpp -Iinclude + +usimdbitpacking.o: include/usimdbitpacking.h src/usimdbitpacking.cpp + $(CXX) $(CXXFLAGS) -c src/usimdbitpacking.cpp -Iinclude + +simdintegratedbitpacking.o: include/simdintegratedbitpacking.h include/delta.h src/simdintegratedbitpacking.cpp + $(CXX) $(CXXFLAGS) -c src/simdintegratedbitpacking.cpp -Iinclude + + + +UNAME := $(shell uname) + + +OBJECTS= bitpacking.o integratedbitpacking.o simdbitpacking.o usimdbitpacking.o simdintegratedbitpacking.o intersection.o + + +unit: $(HEADERS) src/unit.cpp $(OBJECTS) + $(CXX) $(CXXFLAGS) -o unit src/unit.cpp $(OBJECTS) -Iinclude + +testcodecs: $(HEADERS) src/testcodecs.cpp $(OBJECTS) + $(CXX) $(CXXFLAGS) -Iinclude -o testcodecs src/testcodecs.cpp $(OBJECTS) + + + +example: $(HEADERS) example.cpp $(OBJECTS) + $(CXX) $(CXXFLAGS) -o example example.cpp $(OBJECTS) -Iinclude + + + +testintegration: bitpacking.o simdbitpacking.o usimdbitpacking.o integratedbitpacking.o simdintegratedbitpacking.o src/testintegration.cpp $(HEADERS) + $(CXX) $(CXXFLAGS) -Iinclude -o testintegration src/testintegration.cpp bitpacking.o integratedbitpacking.o simdbitpacking.o usimdbitpacking.o simdintegratedbitpacking.o + + + +clean: + rm -f *.o unit testintegration testcodecs simplesynth compress uncompress budgetedtest entropy example benchintersection + + + + + + +BENCHHEADERS= $(shell ls advancedbenchmarking/include/*h) + +simplesynth: $(HEADERS) $(BENCHHEADERS) advancedbenchmarking/src/simplesynth.cpp + $(CXX) $(CXXFLAGS) -o simplesynth advancedbenchmarking/src/simplesynth.cpp -Iinclude -Iadvancedbenchmarking/include + + +compress: $(HEADERS) $(BENCHHEADERS) advancedbenchmarking/src/compress.cpp $(OBJECTS) + $(CXX) 
$(CXXFLAGS) -o compress advancedbenchmarking/src/compress.cpp $(OBJECTS) -Iinclude -Iadvancedbenchmarking/include + +budgetedtest: $(HEADERS) $(BENCHHEADERS) advancedbenchmarking/src/budgetedtest.cpp $(OBJECTS) + $(CXX) $(CXXFLAGS) -o budgetedtest advancedbenchmarking/src/budgetedtest.cpp $(OBJECTS) -Iinclude -Iadvancedbenchmarking/include + +entropy: $(HEADERS) $(BENCHHEADERS) advancedbenchmarking/src/entropy.cpp $(OBJECTS) + $(CXX) $(CXXFLAGS) -o entropy advancedbenchmarking/src/entropy.cpp $(OBJECTS) -Iinclude -Iadvancedbenchmarking/include + + +uncompress: $(HEADERS) $(BENCHHEADERS) advancedbenchmarking/src/uncompress.cpp $(OBJECTS) + $(CXX) $(CXXFLAGS) -o uncompress advancedbenchmarking/src/uncompress.cpp $(OBJECTS) -Iinclude -Iadvancedbenchmarking/include + diff --git a/src/benchintersection.cpp b/src/benchintersection.cpp new file mode 100644 index 0000000..f441513 --- /dev/null +++ b/src/benchintersection.cpp @@ -0,0 +1,141 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ + + +#include +#include "synthetic.h" +#include "timer.h" +#include "intersection.h" + + +/** + * Goal: have the largest array count about 4M terms (this + * matches our experiments), and vary the size of the + * smallest array vary from 1*4M to 1/1000*4M (or so). + * + * Set the size of the intersection to 30% of the lesser + * array. (Again, this matches our real data...) + * + * To match our clueweb, we use a range of values in [0,2**26). 
+ */ +template +pair,vector > getNaivePair(generator gen, uint32_t minlength,uint32_t Max, float sizeratio, float intersectionratio) { + if(sizeratio < 1) throw runtime_error("sizeratio should be larger or equal to 1"); + if(intersectionratio < 0) throw runtime_error("intersectionratio should be positive"); + if(intersectionratio > 1) throw runtime_error("intersectionratio cannot be larger than 1"); + const uint32_t maxlenth = static_cast(round(static_cast(minlength) * sizeratio)); + if(maxlenth > Max) throw runtime_error("I can't generate an array so large in such a small range."); + if(maxlenth < minlength) throw runtime_error("something went wrong, possibly an overflow."); + // we basically assume that, if we do nothing, intersections are very small + const uint32_t intersize = static_cast(round (static_cast(minlength) * intersectionratio)); + + vector inter = gen.generate(intersize,Max); + vector smallest = unite(gen.generate(static_cast(minlength-inter.size()),Max),inter); + vector largest = unite(gen.generate(static_cast(maxlenth-inter.size()),Max),inter); + vector intersection = intersect(smallest,largest); + if(largest.size()>smallest.size()) + return pair,vector >(smallest,largest); + return pair,vector >(largest,smallest); + +} + + + +void printusage() { + cout << " Try ./benchintersection " << endl; +} + +int main(int argc, char **argv) { + size_t howmany = 0; + size_t loop = 3; + uint32_t Big = 22; + float intersectionratio = 0.3f; + uint32_t MaxBit = 26; + int c; + while ((c = getopt(argc, argv, "ns:m:R:M:S:l:h")) != -1) + switch (c) { + case 'h': + printusage(); + return 0; + case 'S': + Big = atoi(optarg); + break; + case 'R': + intersectionratio = atof(optarg); + break; + case 'M': + intersectionratio = atoi(optarg); + if (MaxBit < 1) { + printusage(); + return -1; + } + break; + case 'm': + howmany = atoi(optarg); + if (howmany < 1) { + printusage(); + return -1; + } + break; + case 'l': + loop = atoi(optarg); + if (loop < 1) { + printusage(); + 
return -1; + } + break; + default: + abort(); + } + if (howmany == 0) { + howmany = 5; + } + cout<<"# howmany : "< buffer(2 * (1U << Big)); + for(string intername : IntersectionFactory::allNames()) { + cout< , vector > > data(howmany); + uint32_t smallsize = static_cast(round(static_cast (1 << Big) / ir)); + cout<<"#generating data..."; + cout.flush(); + for (size_t k = 0; k < howmany; ++k) { + data[k] = getNaivePair(cdg, smallsize,1U<(z.split()) ))<<"\t"; + } + cout<<"\t\t"<> 0 ) & 1 ; + out++; + *out = ( (*in) >> 1 ) & 1 ; + out++; + *out = ( (*in) >> 2 ) & 1 ; + out++; + *out = ( (*in) >> 3 ) & 1 ; + out++; + *out = ( (*in) >> 4 ) & 1 ; + out++; + *out = ( (*in) >> 5 ) & 1 ; + out++; + *out = ( (*in) >> 6 ) & 1 ; + out++; + *out = ( (*in) >> 7 ) & 1 ; + out++; + *out = ( (*in) >> 8 ) & 1 ; + out++; + *out = ( (*in) >> 9 ) & 1 ; + out++; + *out = ( (*in) >> 10 ) & 1 ; + out++; + *out = ( (*in) >> 11 ) & 1 ; + out++; + *out = ( (*in) >> 12 ) & 1 ; + out++; + *out = ( (*in) >> 13 ) & 1 ; + out++; + *out = ( (*in) >> 14 ) & 1 ; + out++; + *out = ( (*in) >> 15 ) & 1 ; + out++; + *out = ( (*in) >> 16 ) & 1 ; + out++; + *out = ( (*in) >> 17 ) & 1 ; + out++; + *out = ( (*in) >> 18 ) & 1 ; + out++; + *out = ( (*in) >> 19 ) & 1 ; + out++; + *out = ( (*in) >> 20 ) & 1 ; + out++; + *out = ( (*in) >> 21 ) & 1 ; + out++; + *out = ( (*in) >> 22 ) & 1 ; + out++; + *out = ( (*in) >> 23 ) & 1 ; + out++; + *out = ( (*in) >> 24 ) & 1 ; + out++; + *out = ( (*in) >> 25 ) & 1 ; + out++; + *out = ( (*in) >> 26 ) & 1 ; + out++; + *out = ( (*in) >> 27 ) & 1 ; + out++; + *out = ( (*in) >> 28 ) & 1 ; + out++; + *out = ( (*in) >> 29 ) & 1 ; + out++; + *out = ( (*in) >> 30 ) & 1 ; + out++; + *out = ( (*in) >> 31 ) ; +} + + + + +void __fastunpack2(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + 
out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 30 ) ; +} + + + + +void __fastunpack3(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 3 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 27 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 3 - 
1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 29 ) ; +} + + + + +void __fastunpack5(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 5 ) ; + out++; 
+ *out = ( (*in) >> 16 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 5 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 27 ) ; +} + + + + +void __fastunpack6(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % 
void __fastunpack7(const uint32_t *__restrict__ in,
                   uint32_t *__restrict__ out) {
  // Unpack 32 values of 7 bits each from in[0..6] into out[0..31].
  // Value k occupies bits [7*k, 7*k + 7) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 7) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 7) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 7 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
void __fastunpack10(const uint32_t *__restrict__ in,
                    uint32_t *__restrict__ out) {
  // Unpack 32 values of 10 bits each from in[0..9] into out[0..31].
  // Value k occupies bits [10*k, 10*k + 10) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 10) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 10) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 10 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
void __fastunpack12(const uint32_t *__restrict__ in,
                    uint32_t *__restrict__ out) {
  // Unpack 32 values of 12 bits each from in[0..11] into out[0..31].
  // Value k occupies bits [12*k, 12*k + 12) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 12) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 12) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 12 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
void __fastunpack14(const uint32_t *__restrict__ in,
                    uint32_t *__restrict__ out) {
  // Unpack 32 values of 14 bits each from in[0..13] into out[0..31].
  // Value k occupies bits [14*k, 14*k + 14) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 14) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 14) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 14 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
void __fastunpack17(const uint32_t *__restrict__ in,
                    uint32_t *__restrict__ out) {
  // Unpack 32 values of 17 bits each from in[0..16] into out[0..31].
  // Value k occupies bits [17*k, 17*k + 17) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 17) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 17) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 17 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
void __fastunpack19(const uint32_t *__restrict__ in,
                    uint32_t *__restrict__ out) {
  // Unpack 32 values of 19 bits each from in[0..18] into out[0..31].
  // Value k occupies bits [19*k, 19*k + 19) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 19) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 19) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 19 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
void __fastunpack21(const uint32_t *__restrict__ in,
                    uint32_t *__restrict__ out) {
  // Unpack 32 values of 21 bits each from in[0..20] into out[0..31].
  // Value k occupies bits [21*k, 21*k + 21) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 21) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 21) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 21 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
void __fastunpack23(const uint32_t *__restrict__ in,
                    uint32_t *__restrict__ out) {
  // Unpack 32 values of 23 bits each from in[0..22] into out[0..31].
  // Value k occupies bits [23*k, 23*k + 23) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 23) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 23) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 23 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
void __fastunpack25(const uint32_t *__restrict__ in,
                    uint32_t *__restrict__ out) {
  // Unpack 32 values of 25 bits each from in[0..24] into out[0..31].
  // Value k occupies bits [25*k, 25*k + 25) of the input bit stream,
  // low bits first within each 32-bit word.
  const uint32_t mask = (1U << 25) - 1U;
  uint32_t bitpos = 0;
  for (int k = 0; k < 32; ++k, bitpos += 25) {
    const uint32_t word = bitpos >> 5;    // word holding the value's low bits
    const uint32_t shift = bitpos & 31U;  // bit offset inside that word
    uint32_t v = in[word] >> shift;
    if (shift + 25 > 32) {
      // Value straddles a word boundary: splice in the high bits.
      v |= in[word + 1] << (32U - shift);
    }
    out[k] = v & mask;
  }
}
((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; +} + + + + +void __fastunpack27(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= 
((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 27 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 27 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 27 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 27 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 27 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 27 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; +} + + + + +void __fastunpack28(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( 
(*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; +} + + + + +void __fastunpack29(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) 
% (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); + out++; + *out = ( (*in) >> 24 
) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 29 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 29 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 29 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 29 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 29 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 29 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 29 - 3 ); + out++; + *out = ( (*in) >> 3 ) ; +} + + + + +void __fastunpack30(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; 
+ *out = ( (*in) >> 2 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; +} + + + + +void __fastunpack31(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % 
(1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 31 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 31 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 31 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 31 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 31 - 3 ); + out++; + *out = ( (*in) >> 3 ) ; 
+ ++in; + *out |= ((*in) % (1U<< 2 ))<<( 31 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 31 - 1 ); + out++; + *out = ( (*in) >> 1 ) ; +} + + + + +void __fastunpack4(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + for(uint32_t outer=0; outer< 4 ;++outer) { + for( uint32_t inwordpointer = 0 ;inwordpointer<32; inwordpointer += 4 ) + *(out++) = ( (*in) >> inwordpointer ) % (1U << 4 ) ; + ++in; + } +} + + + + +void __fastunpack8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + for(uint32_t outer=0; outer< 8 ;++outer) { + for( uint32_t inwordpointer = 0 ;inwordpointer<32; inwordpointer += 8 ) + *(out++) = ( (*in) >> inwordpointer ) % (1U << 8 ) ; + ++in; + } +} + + + + +void __fastunpack16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + for(uint32_t outer=0; outer< 16 ;++outer) { + for( uint32_t inwordpointer = 0 ;inwordpointer<32; inwordpointer += 16 ) + *(out++) = ( (*in) >> inwordpointer ) % (1U << 16 ) ; + ++in; + } +} + + + + +void __fastpack1(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) & 1 ; + ++in; + *out |= ( (*in) & 1 ) << 1 ; + ++in; + *out |= ( (*in) & 1 ) << 2 ; + ++in; + *out |= ( (*in) & 1 ) << 3 ; + ++in; + *out |= ( (*in) & 1 ) << 4 ; + ++in; + *out |= ( (*in) & 1 ) << 5 ; + ++in; + *out |= ( (*in) & 1 ) << 6 ; + ++in; + *out |= ( (*in) & 1 ) << 7 ; + ++in; + *out |= ( (*in) & 1 ) << 8 ; + ++in; + *out |= ( (*in) & 1 ) << 9 ; + ++in; + *out |= ( (*in) & 1 ) << 10 ; + ++in; + *out |= ( (*in) & 1 ) << 11 ; + ++in; + *out |= ( (*in) & 1 ) << 12 ; + ++in; + *out |= ( (*in) & 1 ) << 13 ; + ++in; + *out |= ( (*in) & 1 ) << 14 ; + ++in; + *out |= ( (*in) & 1 ) << 15 ; + ++in; + *out |= ( (*in) & 1 ) << 16 ; + ++in; + *out |= ( (*in) & 1 ) << 17 ; + ++in; + *out |= ( (*in) & 1 ) << 18 ; + ++in; + *out |= ( (*in) & 1 ) << 19 ; + ++in; + *out |= ( (*in) & 1 ) << 20 ; + ++in; + *out |= ( (*in) & 1 ) << 21 ; + ++in; + *out |= ( (*in) & 1 ) 
<< 22 ; + ++in; + *out |= ( (*in) & 1 ) << 23 ; + ++in; + *out |= ( (*in) & 1 ) << 24 ; + ++in; + *out |= ( (*in) & 1 ) << 25 ; + ++in; + *out |= ( (*in) & 1 ) << 26 ; + ++in; + *out |= ( (*in) & 1 ) << 27 ; + ++in; + *out |= ( (*in) & 1 ) << 28 ; + ++in; + *out |= ( (*in) & 1 ) << 29 ; + ++in; + *out |= ( (*in) & 1 ) << 30 ; + ++in; + *out |= ( (*in) ) << 31 ; +} + + + + +void __fastpack2(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 2 ) ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 2 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 6 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 10 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 14 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 18 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 20 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 22 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 24 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 26 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + *out = (*in) % (1U << 2 ) ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 2 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 6 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 10 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 14 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 18 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 20 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 22 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 24 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 26 ; + ++in; + *out |= ( (*in) % (1U << 2 ) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; +} + + + + +void 
__fastpack3(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 3 ) ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 3 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 6 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 9 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 15 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 18 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 21 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 24 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 3 ) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 1 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 7 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 10 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 13 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 19 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 22 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 25 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 28 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) % (1U << 3 ) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 2 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 5 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 11 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 14 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 17 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 20 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 23 ; + ++in; + *out |= ( (*in) % (1U << 3 ) ) << 26 ; + ++in; + *out |= ( (*in) ) << 29 ; +} + + + + +void __fastpack4(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 4 ) ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) 
<< 16 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 20 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) % (1U << 4 ) ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 20 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) % (1U << 4 ) ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 20 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) % (1U << 4 ) ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 20 ; + ++in; + *out |= ( (*in) % (1U << 4 ) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; +} + + + + +void __fastpack5(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 5 ) ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 5 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 10 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 15 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 20 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 5 ) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 3 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 13 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 18 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 23 ; + ++in; + *out |= ( (*in) ) << 
28 ; + ++out; + *out = ( (*in) % (1U << 5 ) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 1 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 6 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 11 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 21 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 26 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) % (1U << 5 ) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 9 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 14 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 19 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 24 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) % (1U << 5 ) ) >> ( 5 - 2 ); + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 2 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 7 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 17 ; + ++in; + *out |= ( (*in) % (1U << 5 ) ) << 22 ; + ++in; + *out |= ( (*in) ) << 27 ; +} + + + + +void __fastpack6(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 6 ) ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 6 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 18 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 6 ) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 10 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 6 ) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 2 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 14 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; 
+ *out = (*in) % (1U << 6 ) ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 6 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 18 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 6 ) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 10 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 6 ) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 2 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 14 ; + ++in; + *out |= ( (*in) % (1U << 6 ) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; +} + + + + +void __fastpack7(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 7 ) ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 7 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 14 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 7 ) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 3 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 10 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 17 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) % (1U << 7 ) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 6 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 13 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) % (1U << 7 ) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 2 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 9 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 16 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 23 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 7 ) ) >> ( 7 - 5 
); + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 5 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 19 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 7 ) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 1 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 15 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 22 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) % (1U << 7 ) ) >> ( 7 - 4 ); + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 11 ; + ++in; + *out |= ( (*in) % (1U << 7 ) ) << 18 ; + ++in; + *out |= ( (*in) ) << 25 ; +} + + + + +void __fastpack8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 8 ) ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) % (1U << 8 ) ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) % (1U << 8 ) ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) % (1U << 8 ) ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) % (1U << 8 ) ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) % (1U << 8 ) ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) % (1U << 8 ) ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) 
<< 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) % (1U << 8 ) ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 8 ) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; +} + + + + +void __fastpack9(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 9 ) ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 9 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) % (1U << 9 ) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 4 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 13 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) % (1U << 9 ) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 8 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 9 ) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 3 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 12 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 9 ) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 7 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 16 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) % (1U << 9 ) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 2 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 11 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 20 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) % (1U << 9 ) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 6 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 15 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 9 ) ) >> ( 9 - 1 ); + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 1 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 10 ; + ++in; + *out |= ( (*in) % (1U << 9 ) ) << 19 ; + ++in; + *out |= ( (*in) ) << 28 ; + 
// Packs the low 10 bits of each of the 32 inputs in[0..31] into the
// 10 consecutive 32-bit words out[0..9]; bit i*10 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 10 bits are
// truncated to their low 10 bits, exactly as the unrolled original does.
void __fastpack10(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 10;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 11 bits of each of the 32 inputs in[0..31] into the
// 11 consecutive 32-bit words out[0..10]; bit i*11 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 11 bits are
// truncated to their low 11 bits, exactly as the unrolled original does.
void __fastpack11(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 11;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 12 bits of each of the 32 inputs in[0..31] into the
// 12 consecutive 32-bit words out[0..11]; bit i*12 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 12 bits are
// truncated to their low 12 bits, exactly as the unrolled original does.
void __fastpack12(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 12;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 13 bits of each of the 32 inputs in[0..31] into the
// 13 consecutive 32-bit words out[0..12]; bit i*13 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 13 bits are
// truncated to their low 13 bits, exactly as the unrolled original does.
void __fastpack13(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 13;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 14 bits of each of the 32 inputs in[0..31] into the
// 14 consecutive 32-bit words out[0..13]; bit i*14 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 14 bits are
// truncated to their low 14 bits, exactly as the unrolled original does.
void __fastpack14(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 14;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 15 bits of each of the 32 inputs in[0..31] into the
// 15 consecutive 32-bit words out[0..14]; bit i*15 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 15 bits are
// truncated to their low 15 bits, exactly as the unrolled original does.
void __fastpack15(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 15;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 16 bits of each of the 32 inputs in[0..31] into the
// 16 consecutive 32-bit words out[0..15]; each output word holds two
// consecutive inputs (low half then high half).  Inputs wider than
// 16 bits are truncated to their low 16 bits, as in the unrolled original.
void __fastpack16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 16;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 17 bits of each of the 32 inputs in[0..31] into the
// 17 consecutive 32-bit words out[0..16]; bit i*17 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 17 bits are
// truncated to their low 17 bits, exactly as the unrolled original does.
void __fastpack17(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 17;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 18 bits of each of the 32 inputs in[0..31] into the
// 18 consecutive 32-bit words out[0..17]; bit i*18 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 18 bits are
// truncated to their low 18 bits, exactly as the unrolled original does.
void __fastpack18(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 18;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 19 bits of each of the 32 inputs in[0..31] into the
// 19 consecutive 32-bit words out[0..18]; bit i*19 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 19 bits are
// truncated to their low 19 bits, exactly as the unrolled original does.
void __fastpack19(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 19;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 20 bits of each of the 32 inputs in[0..31] into the
// 20 consecutive 32-bit words out[0..19]; bit i*20 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 20 bits are
// truncated to their low 20 bits, exactly as the unrolled original does.
void __fastpack20(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 20;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 21 bits of each of the 32 inputs in[0..31] into the
// 21 consecutive 32-bit words out[0..20]; bit i*21 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 21 bits are
// truncated to their low 21 bits, exactly as the unrolled original does.
void __fastpack21(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 21;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 22 bits of each of the 32 inputs in[0..31] into the
// 22 consecutive 32-bit words out[0..21]; bit i*22 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 22 bits are
// truncated to their low 22 bits, exactly as the unrolled original does.
void __fastpack22(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 22;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 23 bits of each of the 32 inputs in[0..31] into the
// 23 consecutive 32-bit words out[0..22]; bit i*23 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 23 bits are
// truncated to their low 23 bits, exactly as the unrolled original does.
void __fastpack23(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 23;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
// Packs the low 24 bits of each of the 32 inputs in[0..31] into the
// 24 consecutive 32-bit words out[0..23]; bit i*24 of the output bit
// stream holds the LSB of in[i].  Inputs wider than 24 bits are
// truncated to their low 24 bits, exactly as the unrolled original does.
void __fastpack24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
    constexpr uint32_t kBits = 24;
    constexpr uint32_t kMask = (1U << kBits) - 1;
    for (uint32_t idx = 0; idx < 32; ++idx) {
        const uint32_t val = in[idx] & kMask;
        const uint32_t shift = (idx * kBits) % 32;
        if (shift == 0)
            *out = val;              // first value of a fresh output word
        else
            *out |= val << shift;
        const uint32_t used = shift + kBits;
        if (used >= 32) {
            ++out;                   // current output word is now full
            if (used > 32)
                *out = val >> (32 - shift);  // spill the high bits over
        }
    }
}
; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) % (1U << 24 ) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) % (1U << 24 ) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) % (1U << 24 ) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) % (1U << 24 ) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) % (1U << 24 ) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; +} + + + + +void __fastpack25(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 25 ) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) % (1U << 25 ) ) << 4 ; + 
++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) % (1U << 25 ) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) % (1U << 25 ) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in) % (1U << 25 ) ) << 2 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in) % (1U << 25 ) ) << 6 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 3 ); + ++in; + *out |= ( (*in) % (1U << 25 ) ) << 3 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 25 
) ) >> ( 25 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) % (1U << 25 ) ) >> ( 25 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; +} + + + + +void __fastpack26(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 26 ) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) % (1U << 26 ) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) % (1U << 26 ) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + *out = (*in) % (1U << 26 ) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + 
*out = ( (*in) % (1U << 26 ) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) % (1U << 26 ) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) % (1U << 26 ) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 26 ) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; +} + + + + +void __fastpack27(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 27 ) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) % (1U << 27 ) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = 
( (*in) % (1U << 27 ) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) % (1U << 27 ) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in) % (1U << 27 ) ) << 1 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 3 ); + ++in; + *out |= ( (*in) % (1U << 27 ) ) << 3 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) % (1U << 27 ) ) >> ( 27 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; +} + + + + +void __fastpack28(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 28 ) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) 
% (1U << 28 ) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) % (1U << 28 ) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) % (1U << 28 ) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) % (1U << 28 ) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; 
+ ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 28 ) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; +} + + + + +void __fastpack29(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 29 ) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) % (1U << 29 ) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + 
*out = ( (*in) % (1U << 29 ) ) >> ( 29 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in) % (1U << 29 ) ) << 1 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) % (1U << 29 ) ) >> ( 29 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; +} + + + + +void __fastpack30(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 30 ) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( 
(*in) % (1U << 30 ) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + *out = (*in) % (1U << 30 ) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 4 ); + ++in; + 
*out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) % (1U << 30 ) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; +} + + + + +void __fastpack31(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) % (1U << 31 ) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) % 
(1U << 31 ) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + *out = ( (*in) % (1U << 31 ) ) >> ( 31 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask1(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( 
(*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++in; + *out |= ( (*in) ) << 31 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask2(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask3(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + 
++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 29 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask4(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= 
( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask5(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 5 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 27 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask6(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + 
*out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask7(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; 
+ ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 7 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 7 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 25 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out 
|= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask9(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 9 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 9 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 23 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask10(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= 
( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask11(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + 
++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 11 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 11 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 11 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 11 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 11 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 21 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask12(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + 
*out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask13(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 13 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 13 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 
17 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 13 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 13 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 13 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 13 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 19 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask14(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) 
<< 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask15(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 15 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 15 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 15 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 15 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 15 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + 
*out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 15 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 15 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 17 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask17(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( 
(*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 17 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 17 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 17 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 17 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 17 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 17 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 17 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 17 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask18(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) 
>> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask19(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 
- 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 19 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 19 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 19 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 19 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 19 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 19 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 19 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 19 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 19 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask20(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) 
<< 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask21(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + 
++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 21 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 21 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 21 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 21 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 21 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 21 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 21 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 21 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 21 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 21 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask22(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> 
( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void 
__fastpackwithoutmask23(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 23 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 23 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 23 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 23 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 23 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 23 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 23 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 23 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 23 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( 
(*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 23 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 23 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + 
++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask25(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 25 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + 
*out = ( (*in) ) >> ( 25 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 25 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 25 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 25 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 25 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 25 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask26(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( 
(*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask27(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( 
(*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 27 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 27 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 27 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 27 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 27 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 27 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 27 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 27 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask28(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + 
*out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask29(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 
23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 29 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 29 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 29 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 29 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 29 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 29 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 29 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out 
= ( (*in) ) >> ( 29 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 29 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask30(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + 
*out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; +} + + + + /*assumes that integers fit in the prescribed number of bits */ + void __fastpackwithoutmask31(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 
; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 31 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 31 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 31 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 31 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 31 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 31 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 31 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++out; + *out = ( (*in) ) >> ( 31 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + *out = ( (*in) ) >> ( 31 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; +} + + + +void fastunpack(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch(bit) { + case 0: + __fastunpack0(in,out); + break; + case 1: + __fastunpack1(in,out); + break; + case 2: + __fastunpack2(in,out); + break; + case 3: + __fastunpack3(in,out); + break; + case 4: + __fastunpack4(in,out); + break; + case 5: + __fastunpack5(in,out); + break; + case 6: + __fastunpack6(in,out); + break; + case 7: + __fastunpack7(in,out); + break; + case 8: + __fastunpack8(in,out); + break; + case 9: + __fastunpack9(in,out); + break; + case 10: + __fastunpack10(in,out); + break; + case 11: + __fastunpack11(in,out); + break; + case 12: + __fastunpack12(in,out); + break; + case 13: + __fastunpack13(in,out); + break; + case 14: + __fastunpack14(in,out); + break; + case 15: + __fastunpack15(in,out); + break; + case 16: + __fastunpack16(in,out); + break; + case 17: + __fastunpack17(in,out); + break; + case 18: + __fastunpack18(in,out); + break; + case 19: + __fastunpack19(in,out); + break; + case 20: + __fastunpack20(in,out); + break; + case 21: + __fastunpack21(in,out); + break; + case 22: + __fastunpack22(in,out); + break; + case 23: + __fastunpack23(in,out); + break; + case 24: + __fastunpack24(in,out); + break; + case 25: + __fastunpack25(in,out); + break; + case 26: + __fastunpack26(in,out); + break; + case 27: + __fastunpack27(in,out); + break; + case 28: + __fastunpack28(in,out); + break; + case 29: + __fastunpack29(in,out); + break; + case 30: + __fastunpack30(in,out); + break; + case 31: + __fastunpack31(in,out); + break; + case 32: + __fastunpack32(in,out); + break; + default: + break; + } +} + + + +void fastpack(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch(bit) { + case 0: + __fastpack0(in,out); + break; + case 1: + __fastpack1(in,out); + break; + case 2: + __fastpack2(in,out); + break; + case 3: + __fastpack3(in,out); + break; + case 4: + __fastpack4(in,out); + break; + case 5: + __fastpack5(in,out); + break; + case 6: + __fastpack6(in,out); + break; + case 7: + __fastpack7(in,out); + break; + case 8: + __fastpack8(in,out); + break; + case 9: + __fastpack9(in,out); + break; + case 10: + __fastpack10(in,out); + break; + case 11: + __fastpack11(in,out); + break; + case 12: + __fastpack12(in,out); + break; + case 13: + __fastpack13(in,out); + break; + case 14: + __fastpack14(in,out); + break; + case 15: + __fastpack15(in,out); + break; + case 16: + __fastpack16(in,out); + break; + case 17: + __fastpack17(in,out); + break; + case 18: + __fastpack18(in,out); + break; + case 19: + __fastpack19(in,out); + break; + case 20: + __fastpack20(in,out); + break; + case 21: + __fastpack21(in,out); + break; + case 22: + __fastpack22(in,out); + break; + case 23: + __fastpack23(in,out); + break; + case 24: + __fastpack24(in,out); + break; + case 25: + __fastpack25(in,out); + break; + case 26: + __fastpack26(in,out); + break; + case 27: + __fastpack27(in,out); + break; + case 28: + __fastpack28(in,out); + break; + case 29: + __fastpack29(in,out); + break; + case 30: + __fastpack30(in,out); + break; + case 31: + __fastpack31(in,out); + break; + case 32: + __fastpack32(in,out); + break; + default: + break; + } +} + + + +/*assumes that integers fit in the prescribed number of bits*/ +void fastpackwithoutmask(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch(bit) { + case 0: + __fastpackwithoutmask0(in,out); + break; + case 1: + __fastpackwithoutmask1(in,out); + break; + case 2: + __fastpackwithoutmask2(in,out); + break; + case 3: + __fastpackwithoutmask3(in,out); + break; + case 4: + __fastpackwithoutmask4(in,out); + break; + case 5: + __fastpackwithoutmask5(in,out); + break; + case 6: + __fastpackwithoutmask6(in,out); + break; + case 7: + __fastpackwithoutmask7(in,out); + break; + case 8: + __fastpackwithoutmask8(in,out); + break; + case 9: + __fastpackwithoutmask9(in,out); + break; + case 10: + __fastpackwithoutmask10(in,out); + break; + case 11: + __fastpackwithoutmask11(in,out); + break; + case 12: + __fastpackwithoutmask12(in,out); + break; + case 13: + __fastpackwithoutmask13(in,out); + break; + case 14: + __fastpackwithoutmask14(in,out); + break; + case 15: + __fastpackwithoutmask15(in,out); + break; + case 16: + __fastpackwithoutmask16(in,out); + break; + case 17: + __fastpackwithoutmask17(in,out); + break; + case 18: + __fastpackwithoutmask18(in,out); + break; + case 19: + __fastpackwithoutmask19(in,out); + break; + case 20: + __fastpackwithoutmask20(in,out); + break; + case 21: + __fastpackwithoutmask21(in,out); + break; + case 22: + __fastpackwithoutmask22(in,out); + break; + case 23: + __fastpackwithoutmask23(in,out); + break; + case 24: + __fastpackwithoutmask24(in,out); + break; + case 25: + __fastpackwithoutmask25(in,out); + break; + case 26: + __fastpackwithoutmask26(in,out); + break; + case 27: + __fastpackwithoutmask27(in,out); + break; + case 28: + __fastpackwithoutmask28(in,out); + break; + case 29: + __fastpackwithoutmask29(in,out); + break; + case 30: + __fastpackwithoutmask30(in,out); + break; + case 31: + __fastpackwithoutmask31(in,out); + break; + case 32: + __fastpackwithoutmask32(in,out); + break; + default: + break; + } +} + diff --git a/src/integratedbitpacking.cpp b/src/integratedbitpacking.cpp new file mode 100644 index 0000000..d54b00e --- /dev/null +++ 
b/src/integratedbitpacking.cpp @@ -0,0 +1,6724 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#include "integratedbitpacking.h" + + +void __integratedfastunpack0(const uint32_t initoffset, const uint32_t * __restrict__ , uint32_t * __restrict__ out) { + for(uint32_t i = 0; i<32;++i) + *(out++) = initoffset; +} +void __integratedfastpack0(const uint32_t, const uint32_t * __restrict__ , uint32_t * __restrict__ ) { +} + + +void __integratedfastunpack32(const uint32_t, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + for(int k = 0 ; k < 32 ;++k) + out[k] = in[k]; // no sense in wasting time with deltas +} + +void __integratedfastpack32(const uint32_t, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + for(int k = 0 ; k < 32 ;++k) + out[k] = in[k] ; // no sense in wasting time with deltas +} + + +void __integratedfastunpack2(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 2 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 2 ) ; + *out += out[-1] ; // 
integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) % (1U << 2 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) % (1U << 2 ) ; + *out += 
out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack3(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 3 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 3 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta 
decoding + out++; + *out = ( *in >> 22 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 3 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) % (1U << 3 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack5(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 5 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( 
*in >> 25 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 5 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 5 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 5 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding 
+ out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 5 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) % (1U << 5 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack6(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 6 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 6 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 6 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 6 ) ; + *out += 
out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 6 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 6 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 6 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + *out += out[-1] ; // integrated delta 
decoding +} + + + + +/* Unpacks 32 7-bit packed values from 7 consecutive 32-bit input words and applies an integrated (prefix-sum) delta decode: the first output adds initoffset, each subsequent output adds the previously decoded value (out[-1]). Values straddling a word boundary are reassembled via the shift/OR spill pattern below. */ void __integratedfastunpack7(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 7 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 7 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 7 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 7 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + 
++in; + *out |= (*in % (1U<< 5 ))<<( 7 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 7 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 7 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 7 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 9-bit packed values from 9 input words with the same integrated (prefix-sum) delta decode; first output adds initoffset. */ void __integratedfastunpack9(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 9 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 9 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 9 ) ; + *out += 
out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 9 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 9 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 9 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 9 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 9 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) % (1U << 9 ) ; + *out += 
out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 9 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 9 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 9 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 10-bit packed values from 10 input words with integrated (prefix-sum) delta decoding; first output adds initoffset. Fields ending exactly on a word boundary (e.g. >> 22 with 10 bits) need no spill mask. */ void __integratedfastunpack10(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 10 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 10 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 10 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 4 
))<<( 10 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 10 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 10 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 10 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 10 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % 
(1U<< 2 ))<<( 10 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 10 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 11-bit packed values from 11 input words with integrated (prefix-sum) delta decoding; first output adds initoffset. */ void __integratedfastunpack11(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 11 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 11 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 11 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 11 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 11 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 
) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 11 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 11 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 11 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 11 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 11 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 11 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 11 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 12-bit packed values from 12 input words (an 8-value pattern repeated four times) with integrated (prefix-sum) delta decoding; first output adds initoffset. */ void __integratedfastunpack12(const uint32_t initoffset, const uint32_t * 
__restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 12 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 12 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 12 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 12 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 12 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 12 - 4 ); + *out 
+= out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 12 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 12 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 12 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 12 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 13-bit packed values from 13 input words with integrated (prefix-sum) delta decoding; first output adds initoffset. */ void __integratedfastunpack13(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 13 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 13 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 1 
))<<( 13 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 13 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 13 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 13 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 13 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 13 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 13 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 13 - 
11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 13 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 13 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 13 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 13 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 14-bit packed values from 14 input words (a 16-value pattern repeated twice) with integrated (prefix-sum) delta decoding; first output adds initoffset. */ void __integratedfastunpack14(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 14 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 14 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 14 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 14 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 14 ) ; + *out += out[-1] ; 
// integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 14 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 14 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 14 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 14 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 14 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 14 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 14 - 12 ); + *out += out[-1] ; // integrated 
delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 14 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 14 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 14 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 15-bit packed values from 15 input words with integrated (prefix-sum) delta decoding; first output adds initoffset. */ void __integratedfastunpack15(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 15 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 15 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 15 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 15 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 15 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 15 - 5 ); + *out += out[-1] ; // 
integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 15 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 15 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 15 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 15 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 15 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 15 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 15 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 15 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 15 
) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 15 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 15 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 17-bit packed values from 17 input words with integrated (prefix-sum) delta decoding; first output adds initoffset. Every value past the first straddles or abuts a word boundary, hence the spill on nearly every step. */ void __integratedfastunpack17(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 17 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 17 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 17 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 17 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 17 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 17 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 17 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + 
++in; + *out |= (*in % (1U<< 14 ))<<( 17 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 17 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 17 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 17 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 17 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 17 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 17 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 17 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 17 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) % (1U << 17 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 15 
))<<( 17 - 15 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +/* Unpacks 32 18-bit packed values from 18 input words (a 16-value pattern repeated twice) with integrated (prefix-sum) delta decoding; first output adds initoffset. */ void __integratedfastunpack18(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 18 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 18 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 18 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 18 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 18 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 18 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 18 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 18 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 18 - 14 ); + *out += out[-1] ; // integrated 
delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 18 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 18 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 18 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 18 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 18 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 18 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 18 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 18 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 18 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack19(const uint32_t initoffset, 
const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 19 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 19 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 19 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 19 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 19 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 19 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 17 ))<<( 19 - 17 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 19 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 19 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 19 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 
3 ))<<( 19 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 19 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 15 ))<<( 19 - 15 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 19 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 19 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 19 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 19 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 19 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 19 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 19 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack20(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 20 ) ; + *out += initoffset ; // integrated 
delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 20 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 20 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 20 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 20 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 20 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 20 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 20 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 20 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 20 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) 
% (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 20 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 20 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 20 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 20 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 20 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 20 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 20 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 20 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack21(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 21 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 21 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated 
delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 21 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 21 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 19 ))<<( 21 - 19 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 21 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 21 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 21 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 17 ))<<( 21 - 17 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 21 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 21 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 21 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 15 ))<<( 21 - 15 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) ; + 
++in; + *out |= (*in % (1U<< 4 ))<<( 21 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 21 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 21 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 21 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 21 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 21 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 21 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 21 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 21 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack22(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 22 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 22 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 22 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + 
*out = ( *in >> 2 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 22 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 22 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 22 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 22 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 22 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 22 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 22 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 22 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 22 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 22 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + 
*out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 22 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 22 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 22 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 22 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 22 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 22 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 22 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 22 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 22 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack23(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 23 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 23 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 23 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 23 ) ; + *out += out[-1] ; // integrated 
delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 19 ))<<( 23 - 19 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 23 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 23 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 23 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 15 ))<<( 23 - 15 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 23 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 23 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 23 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 23 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 23 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 23 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 23 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 23 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) % (1U << 23 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 21 ))<<( 23 - 21 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 23 - 12 ); + *out += out[-1] ; // integrated delta decoding 
+ out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 23 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 23 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 17 ))<<( 23 - 17 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 23 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) % (1U << 23 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 23 - 22 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 23 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 23 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 23 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 23 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 23 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack24(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 24 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 24 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 24 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out += out[-1] ; // integrated 
delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 24 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 24 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 24 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 24 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 24 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 24 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 24 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 24 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 24 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 24 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 24 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 24 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 24 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 24 - 
16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 24 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 24 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 24 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 24 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 24 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 24 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 24 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack25(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 25 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 25 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 25 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 25 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 25 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 25 - 22 ); + *out += out[-1] ; 
// integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 15 ))<<( 25 - 15 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 25 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 25 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 25 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 19 ))<<( 25 - 19 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 25 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 25 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) % (1U << 25 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 23 ))<<( 25 - 23 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 25 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 25 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 25 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 25 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 25 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 25 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 25 - 6 ); + 
*out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) % (1U << 25 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 25 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 17 ))<<( 25 - 17 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 25 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 25 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 25 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 21 ))<<( 25 - 21 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 25 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 25 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack26(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 26 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 26 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 26 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 26 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 26 - 2 ); + *out += out[-1] ; // integrated 
delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 26 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 26 - 22 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 26 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 26 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 26 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 26 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 26 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 26 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 26 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 26 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 26 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 26 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 26 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 26 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 26 - 2 ); + *out += out[-1] ; // integrated delta decoding + 
out++; + *out = ( *in >> 2 ) % (1U << 26 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 26 - 22 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 26 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 26 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 26 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 26 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 26 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 26 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 26 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 26 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack27(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 27 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 27 - 22 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 17 ))<<( 27 - 17 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 27 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; 
+ ++in; + *out |= (*in % (1U<< 7 ))<<( 27 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 27 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 27 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 27 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 19 ))<<( 27 - 19 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 27 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 27 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 27 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) % (1U << 27 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 26 ))<<( 27 - 26 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 21 ))<<( 27 - 21 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 27 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 27 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 27 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 27 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 27 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = 
( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 23 ))<<( 27 - 23 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 27 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 27 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 27 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 27 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) % (1U << 27 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 25 ))<<( 27 - 25 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 27 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 15 ))<<( 27 - 15 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 27 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 27 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack28(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 28 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 28 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 28 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + 
*out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 28 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 28 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 28 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 28 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 28 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 28 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 28 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 28 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 28 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 28 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 28 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 28 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 28 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 28 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + 
++in; + *out |= (*in % (1U<< 16 ))<<( 28 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 28 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 28 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 28 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 0 ) % (1U << 28 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 28 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 28 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 28 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 28 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 28 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 28 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack29(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 29 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 26 ))<<( 29 - 26 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 23 ))<<( 29 - 
23 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 29 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 17 ))<<( 29 - 17 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 29 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 29 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 29 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 29 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 29 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) % (1U << 29 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 28 ))<<( 29 - 28 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 25 ))<<( 29 - 25 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 29 - 22 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 19 ))<<( 29 - 19 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 29 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 29 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 29 - 10 ); + *out += out[-1] ; // 
integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 29 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 29 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 29 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) % (1U << 29 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 27 ))<<( 29 - 27 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 29 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 21 ))<<( 29 - 21 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 29 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 15 ))<<( 29 - 15 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 29 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 29 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 29 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 29 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 3 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack30(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 30 ) ; + *out += initoffset ; // 
integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 28 ))<<( 30 - 28 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 26 ))<<( 30 - 26 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 30 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 30 - 22 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 30 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 30 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 30 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 30 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 30 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 30 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 30 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 30 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 30 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 30 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) ; + ++in; + *out += out[-1] ; // integrated delta decoding + out++; + *out = 
( *in >> 0 ) % (1U << 30 ) ; + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 28 ))<<( 30 - 28 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 26 ))<<( 30 - 26 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 30 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 30 - 22 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 30 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 30 - 18 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 30 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 30 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 30 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 30 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 30 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 30 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 30 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 30 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) ; + *out += out[-1] ; // 
integrated delta decoding +} + + + + +void __integratedfastunpack31(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in >> 0 ) % (1U << 31 ) ; + *out += initoffset ; // integrated delta decoding + out++; + *out = ( *in >> 31 ) ; + ++in; + *out |= (*in % (1U<< 30 ))<<( 31 - 30 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 30 ) ; + ++in; + *out |= (*in % (1U<< 29 ))<<( 31 - 29 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 29 ) ; + ++in; + *out |= (*in % (1U<< 28 ))<<( 31 - 28 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 28 ) ; + ++in; + *out |= (*in % (1U<< 27 ))<<( 31 - 27 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 27 ) ; + ++in; + *out |= (*in % (1U<< 26 ))<<( 31 - 26 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 26 ) ; + ++in; + *out |= (*in % (1U<< 25 ))<<( 31 - 25 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 25 ) ; + ++in; + *out |= (*in % (1U<< 24 ))<<( 31 - 24 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 24 ) ; + ++in; + *out |= (*in % (1U<< 23 ))<<( 31 - 23 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 23 ) ; + ++in; + *out |= (*in % (1U<< 22 ))<<( 31 - 22 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 22 ) ; + ++in; + *out |= (*in % (1U<< 21 ))<<( 31 - 21 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 21 ) ; + ++in; + *out |= (*in % (1U<< 20 ))<<( 31 - 20 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 20 ) ; + ++in; + *out |= (*in % (1U<< 19 ))<<( 31 - 19 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 19 ) ; + ++in; + *out |= (*in % (1U<< 18 ))<<( 31 - 18 ); + *out += out[-1] ; // integrated delta decoding + 
out++; + *out = ( *in >> 18 ) ; + ++in; + *out |= (*in % (1U<< 17 ))<<( 31 - 17 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 17 ) ; + ++in; + *out |= (*in % (1U<< 16 ))<<( 31 - 16 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 16 ) ; + ++in; + *out |= (*in % (1U<< 15 ))<<( 31 - 15 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 15 ) ; + ++in; + *out |= (*in % (1U<< 14 ))<<( 31 - 14 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 14 ) ; + ++in; + *out |= (*in % (1U<< 13 ))<<( 31 - 13 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 13 ) ; + ++in; + *out |= (*in % (1U<< 12 ))<<( 31 - 12 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 12 ) ; + ++in; + *out |= (*in % (1U<< 11 ))<<( 31 - 11 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 11 ) ; + ++in; + *out |= (*in % (1U<< 10 ))<<( 31 - 10 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 10 ) ; + ++in; + *out |= (*in % (1U<< 9 ))<<( 31 - 9 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 9 ) ; + ++in; + *out |= (*in % (1U<< 8 ))<<( 31 - 8 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 8 ) ; + ++in; + *out |= (*in % (1U<< 7 ))<<( 31 - 7 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 7 ) ; + ++in; + *out |= (*in % (1U<< 6 ))<<( 31 - 6 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 6 ) ; + ++in; + *out |= (*in % (1U<< 5 ))<<( 31 - 5 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 5 ) ; + ++in; + *out |= (*in % (1U<< 4 ))<<( 31 - 4 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 4 ) ; + ++in; + *out |= (*in % (1U<< 3 ))<<( 31 - 3 ); + *out += out[-1] ; // integrated delta decoding + out++; + 
*out = ( *in >> 3 ) ; + ++in; + *out |= (*in % (1U<< 2 ))<<( 31 - 2 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 2 ) ; + ++in; + *out |= (*in % (1U<< 1 ))<<( 31 - 1 ); + *out += out[-1] ; // integrated delta decoding + out++; + *out = ( *in >> 1 ) ; + *out += out[-1] ; // integrated delta decoding +} + + + + +void __integratedfastunpack1(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = ( *in & 1 ) + initoffset; + ++out; + *out = ( (*in >> 1) & 1) + out[-1]; + ++out; + for( uint32_t i = 2 ; i < 32; i += 1 ) { + *out = ( ( *in >> i) & 1 ) + out[-1]; + ++i; + ++out; + *out = ( ( *in >> i) & 1 ) + out[-1]; + ++out; + } +} + + +void __integratedfastunpack4(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *(out++) = ( *in % (1U << 4 ) ) + initoffset; + for( uint32_t i = 4 ; i < 32; i += 4 ) { + *out = ( ( *in >> i ) % (1U << 4 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 4 ) { + *out = ( ( *in >> i ) % (1U << 4 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 4 ) { + *out = ( ( *in >> i ) % (1U << 4 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 4 ) { + *out = ( ( *in >> i ) % (1U << 4 ) ) + out[-1]; + ++out; + } +} + + + + +void __integratedfastunpack8(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *(out++) = ( *in % (1U << 8 ) ) + initoffset; + for( uint32_t i = 8 ; i < 32; i += 8 ) { + *out = ( ( *in >> i ) % (1U << 8 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 8 ) { + *out = ( ( *in >> i ) % (1U << 8 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 8 ) { + *out = ( ( *in >> i ) % (1U << 8 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 8 ) { + *out = ( ( *in >> i ) % (1U << 8 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i 
+= 8 ) { + *out = ( ( *in >> i ) % (1U << 8 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 8 ) { + *out = ( ( *in >> i ) % (1U << 8 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 8 ) { + *out = ( ( *in >> i ) % (1U << 8 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 8 ) { + *out = ( ( *in >> i ) % (1U << 8 ) ) + out[-1]; + ++out; + } +} + + + + +void __integratedfastunpack16(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *(out++) = ( *in % (1U << 16 ) ) + initoffset; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; 
i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } + ++in; + for( uint32_t i = 0; i < 32; i += 16 ) { + *out = ( ( *in >> i ) % (1U << 16 ) ) + out[-1]; + ++out; + } +} + + + + + void __integratedfastpack2(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 2 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 22 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 24 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 26 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 28 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 2 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 16 ; + 
++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 22 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 24 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 26 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 2 ) ) << 28 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; +} + + + + + void __integratedfastpack3(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 3 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 15 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 21 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 24 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 27 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 3 ) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 13 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 19 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 22 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 25 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 28 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 3 ) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 2 ; + ++in; + *out |= ( (*in - 
in[-1] ) % (1U << 3 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 17 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 23 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 3 ) ) << 26 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; +} + + + + + void __integratedfastpack5(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 5 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 15 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 25 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 5 ) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 13 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 23 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 5 ) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 21 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 26 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 5 ) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 4 ; + 
++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 19 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 24 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 5 ) ) >> ( 5 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 17 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 5 ) ) << 22 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; +} + + + + + void __integratedfastpack6(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 6 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 24 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 6 ) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 22 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 6 ) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 6 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 
12 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 24 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 6 ) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 22 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 6 ) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 6 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; +} + + + + + void __integratedfastpack7(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 7 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 21 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 7 ) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 17 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 24 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 7 ) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 13 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 7 ) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U 
<< 7 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 23 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 7 ) ) >> ( 7 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 19 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 7 ) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 15 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 22 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 7 ) ) >> ( 7 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 7 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; +} + + + + + void __integratedfastpack9(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 9 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 9 ) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 13 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 22 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 9 ) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 17 ; + ++in; + *out |= ( (*in - in[-1] 
) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 9 ) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 21 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 9 ) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 9 ) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 9 ) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 15 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 9 ) ) >> ( 9 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 19 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 9 ) ) >> ( 9 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 9 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; +} + + + + + void __integratedfastpack10(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 10 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 10 ) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in - 
in[-1] ) % (1U << 10 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 10 ) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 10 ) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 10 ) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 10 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 10 ) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 10 ) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 10 ) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 10 ) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 10 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; +} + + + + + 
void __integratedfastpack11(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 11 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 13 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 15 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 17 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % 
(1U << 11 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 19 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 20 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 11 ) ) >> ( 11 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 11 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; +} + + + + + void __integratedfastpack12(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 12 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 12 ) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 12 ) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 12 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 12 ) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 12 ) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 12 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 
12 ) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 12 ) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 12 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 12 ) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 12 ) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 12 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; +} + + + + + void __integratedfastpack13(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 13 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 13 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 15 ; + ++in; + *out |= ( (*in - 
in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 17 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 18 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 13 ) ) >> ( 13 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 13 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; +} + + + + + void __integratedfastpack14(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 14 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 10 ; + 
++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 14 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) 
) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 14 ) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 14 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; +} + + + + + void __integratedfastpack15(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 15 ) ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 15 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 13 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 16 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 
15 ) ) >> ( 15 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 15 ) ) >> ( 15 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 15 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; +} + + + + + void __integratedfastpack17(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 17 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in - 
in[-1] ) % (1U << 17 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 14 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 17 ) ) << 13 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 17 ) ) >> ( 17 - 15 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 15 ; +} + + + + + void __integratedfastpack18(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + 
*out = (*in - initoffset ) % (1U << 18 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 18 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in 
- in[-1] ) % (1U << 18 ) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 18 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 18 ) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; +} + + + + + void __integratedfastpack19(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 19 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 12 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 11 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 4 ; + ++in; 
+ *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 15 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 15 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 19 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 19 ) ) >> ( 19 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 13 ; +} + + + + + void __integratedfastpack20(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 20 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 8 ); + ++in; 
+ *out |= ( (*in - in[-1] ) % (1U << 20 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 20 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 20 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 20 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 20 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 20 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 20 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 20 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 20 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 
20 ) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 20 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 20 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 20 ) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; +} + + + + + void __integratedfastpack21(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 21 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 10 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 9 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) 
% (1U << 21 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 15 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 15 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 13 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 21 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 21 ) ) >> ( 21 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 11 ; +} + + + + + void __integratedfastpack22(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 22 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 
) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 22 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 22 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 22 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 22 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 22 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 22 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 22 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; 
+ ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 22 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 22 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 22 ) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; +} + + + + + void __integratedfastpack23(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 23 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 23 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 23 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 15 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 23 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + 
++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 11 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 23 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 23 ) ) << 7 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 21 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 23 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 17 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 23 ) ) << 8 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 13 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 23 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 23 ) ) >> ( 23 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 9 ; +} + + + + + void __integratedfastpack24(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * 
__restrict__ out) { + *out = (*in - initoffset ) % (1U << 24 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 24 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 24 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 24 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 24 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 24 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + 
++in; + *out = (*in - in[-1] ) % (1U << 24 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 24 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 24 ) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; +} + + + + + void __integratedfastpack25(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 25 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 11 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 25 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 15 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 25 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in - 
in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 25 ) ) << 5 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 9 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 25 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 13 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 25 ) ) << 6 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 17 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 25 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 21 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 25 ) ) >> ( 25 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 7 ; +} + + + + + void 
__integratedfastpack26(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 26 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 26 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 26 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 6 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 26 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in - 
in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 26 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 26 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 26 ) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 6 ; +} + + + + + void __integratedfastpack27(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 27 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 7 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) 
) >> ( 27 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 27 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 9 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 27 ) ) << 4 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 11 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 6 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 27 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 13 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 
27 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 27 ) ) << 3 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 25 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 15 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 15 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 27 ) ) >> ( 27 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 5 ; +} + + + + + void __integratedfastpack28(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 28 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 4 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 28 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 16 ); + ++in; + 
*out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 4 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 28 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 4 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 28 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 28 ) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 4 ; +} + + + + + void __integratedfastpack29(const uint32_t 
initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 29 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 11 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 5 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 29 ) ) << 2 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 13 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 10 ); 
+ ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 7 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 4 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) % (1U << 29 ) ) << 1 ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 21 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 15 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 15 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 9 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 6 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 29 ) ) >> ( 29 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 3 ; +} + + + + + void __integratedfastpack30(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 30 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 24 ); + ++in; + *out |= 
( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 6 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 4 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 2 ; + ++out; + ++in; + *out = (*in - in[-1] ) % (1U << 30 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) 
>> ( 30 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 6 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 4 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 30 ) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 2 ; +} + + + + + void __integratedfastpack31(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = (*in - initoffset ) % (1U << 31 ) ; + ++in; + *out |= ( (*in - in[-1] ) ) << 31 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 30 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 29 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 28 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 27 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 26 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 25 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 24 ; + ++out; + 
*out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 23 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 22 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 21 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 20 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 19 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 18 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 17 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 16 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 15 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 14 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 13 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 13 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 12 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 11 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 10 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 9 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 8 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 8 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 7 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 7 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 6 ); + ++in; + 
*out |= ( (*in - in[-1] ) ) << 6 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 5 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 5 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 4 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 4 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 3 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 3 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 2 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 2 ; + ++out; + *out = ( (*in - in[-1] ) % (1U << 31 ) ) >> ( 31 - 1 ); + ++in; + *out |= ( (*in - in[-1] ) ) << 1 ; +} + + + + /*assumes that integers fit in the prescribed number of bits*/ + void __integratedfastpack1(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = *(in++) - initoffset ; + for( uint32_t i = 1 ; i < 32; i += 1 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ + void __integratedfastpack4(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = *(in++) - initoffset ; + for( uint32_t i = 4 ; i < 32; i += 4 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 4 ; i < 32; i += 4 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 4 ; i < 32; i += 4 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 4 ; i < 32; i += 4 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; +} + + + + /*assumes that integers fit in the prescribed number of bits*/ + void __integratedfastpack8(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = *(in++) - initoffset ; + for( uint32_t i = 8 ; i < 32; i += 8 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( 
uint32_t i = 8 ; i < 32; i += 8 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 8 ; i < 32; i += 8 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 8 ; i < 32; i += 8 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 8 ; i < 32; i += 8 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 8 ; i < 32; i += 8 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 8 ; i < 32; i += 8 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 8 ; i < 32; i += 8 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; +} + + + + /*assumes that integers fit in the prescribed number of bits*/ + void __integratedfastpack16(const uint32_t initoffset, const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out = *(in++) - initoffset ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + 
++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; + *out = *in - in[-1] ; + ++in ; + for( uint32_t i = 16 ; i < 32; i += 16 ) { + *out |= ( *in - in[-1] ) << i ; + ++in ; + } + ++out; +} + + diff --git a/src/intersection.cpp b/src/intersection.cpp new file mode 100644 index 0000000..4dfd640 --- /dev/null +++ b/src/intersection.cpp @@ -0,0 +1,617 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + */ + +#include "intersection.h" + + +/** + * This is often called galloping or exponential search. + * + * Used by frogintersectioncardinality below + * + * Based on binary search... + * Find the smallest integer larger than pos such + * that array[pos]>= min. + * If none can be found, return array.length. + * From code by O. Kaser. 
+ */ +static size_t __frogadvanceUntil(const uint32_t * array, const size_t pos, + const size_t length, const size_t min) { + size_t lower = pos + 1; + + // special handling for a possibly common sequential case + if ((lower >= length) or (array[lower] >= min)) { + return lower; + } + + size_t spansize = 1; // could set larger + // bootstrap an upper limit + + while ((lower + spansize < length) and (array[lower + spansize] < min)) + spansize *= 2; + size_t upper = (lower + spansize < length) ? lower + spansize : length - 1; + + // maybe we are lucky (could be common case when the seek ahead expected to be small and sequential will otherwise make us look bad) + //if (array[upper] == min) { + // return upper; + //} + + if (array[upper] < min) {// means array has no item >= min + return length; + } + + // we know that the next-smallest span was too small + lower += (spansize / 2); + + // else begin binary search + size_t mid = 0; + while (lower + 1 != upper) { + mid = (lower + upper) / 2; + if (array[mid] == min) { + return mid; + } else if (array[mid] < min) + lower = mid; + else + upper = mid; + } + return upper; + +} + + +size_t onesidedgallopingintersection(const uint32_t * smallset, + const size_t smalllength, const uint32_t * largeset, + const size_t largelength, uint32_t * out) { + if(largelength < smalllength) return onesidedgallopingintersection(largeset,largelength,smallset,smalllength,out); + if (0 == smalllength) + return 0; + const uint32_t * const initout(out); + size_t k1 = 0, k2 = 0; + while (true) { + if (largeset[k1] < smallset[k2]) { + k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]); + if (k1 == largelength) + break; + } + midpoint: if (smallset[k2] < largeset[k1]) { + ++k2; + if (k2 == smalllength) + break; + } else { + *out++ = smallset[k2]; + ++k2; + if (k2 == smalllength) + break; + k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]); + if (k1 == largelength) + break; + goto midpoint; + } + } + return out - initout; 
+ +} + + + +/** + * Fast scalar scheme designed by N. Kurz. + */ +size_t nate_scalar(const uint32_t *A, const size_t lenA, + const uint32_t *B, const size_t lenB, uint32_t * out) { + const uint32_t * const initout(out); + if (lenA == 0 || lenB == 0) + return 0; + + const uint32_t *endA = A + lenA; + const uint32_t *endB = B + lenB; + + while (1) { + while (*A < *B) { + SKIP_FIRST_COMPARE: if (++A == endA) + return (out - initout); + } + while (*A > *B) { + if (++B == endB) + return (out - initout); + } + if (*A == *B) { + *out++ = *A; + if (++A == endA || ++B == endB) + return (out - initout); + } else { + goto SKIP_FIRST_COMPARE; + } + } + + return (out - initout); // NOTREACHED +} + + + + + +size_t match_scalar(const uint32_t *A, const size_t lenA, + const uint32_t *B, const size_t lenB, + uint32_t *out) { + + const uint32_t *initout = out; + if (lenA == 0 || lenB == 0) return 0; + + const uint32_t *endA = A + lenA; + const uint32_t *endB = B + lenB; + + while (1) { + while (*A < *B) { + SKIP_FIRST_COMPARE: + if (++A == endA) goto FINISH; + } + while (*A > *B) { + if (++B == endB) goto FINISH; + } + if (*A == *B) { + *out++ = *A; + if (++A == endA || ++B == endB) goto FINISH; + } else { + goto SKIP_FIRST_COMPARE; + } + } + + FINISH: + return (out - initout); +} +#define VEC_T __m128i + +#define VEC_COPY_LOW(reg_dest, xmm_src) \ + __asm volatile("movd %1, %0" : "=r" (reg_dest) : "x" (xmm_src)) + +#define VEC_OR(dest, other) \ + __asm volatile("por %1, %0" : "+x" (dest) : "x" (other) ) + +// // decltype is C++ and typeof is C +#define VEC_ADD_PTEST(var, add, xmm) { \ + decltype(var) _new = var + add; \ + __asm volatile("ptest %2, %2\n\t" \ + "cmovnz %1, %0\n\t" \ + : /* writes */ "+r" (var) \ + : /* reads */ "r" (_new), "x" (xmm) \ + : /* clobbers */ "cc"); \ + } + +#define VEC_CMP_GREATER(dest, other) \ + __asm volatile("pcmpgtd %1, %0" : "+x" (dest) : "x" (other)) + +#define VEC_CMP_EQUAL(dest, other) \ + __asm volatile("pcmpeqd %1, %0" : "+x" (dest) : "x" 
(other)) + +#define VEC_SET_ALL_TO_INT(reg, int32) \ + __asm volatile("movd %1, %0; pshufd $0, %0, %0" \ + : "=x" (reg) : "g" (int32) ) + +#define VEC_LOAD_OFFSET(xmm, ptr, bytes) \ + __asm volatile("movdqu %c2(%1), %0" : "=x" (xmm) : \ + "r" (ptr), "i" (bytes)) + +#define COMPILER_LIKELY(x) __builtin_expect((x),1) +#define COMPILER_RARELY(x) __builtin_expect((x),0) + +#define ASM_LEA_ADD_BYTES(ptr, bytes) \ + __asm volatile("lea %c1(%0), %0\n\t" : \ + /* reads/writes %0 */ "+r" (ptr) : \ + /* reads */ "i" (bytes)); + +/** + * Intersections scheme designed by N. Kurz that works very + * well when intersecting an array with another where the density + * differential is small (between 2 to 10). + * + * Note that this is not symmetric: flipping the rare and freq pointers + * as well as lenRare and lenFreq could lead to significant performance + * differences. + * + */ +size_t match_v4_f2_p0 +(const uint32_t *rare, size_t lenRare, + const uint32_t *freq, size_t lenFreq, + uint32_t *matchOut) { + assert(lenRare <= lenFreq); + const uint32_t *matchOrig = matchOut; + if (lenFreq == 0 || lenRare == 0) return 0; + + const uint64_t kFreqSpace = 2 * 4 * (0 + 1) - 1; + const uint64_t kRareSpace = 0; + + const uint32_t *stopFreq = &freq[lenFreq] - kFreqSpace; + const uint32_t *stopRare = &rare[lenRare] - kRareSpace; + + VEC_T Rare; + + VEC_T F0, F1; + + if (COMPILER_RARELY(rare >= stopRare)) goto FINISH_SCALAR; + uint64_t valRare; + valRare = rare[0]; + VEC_SET_ALL_TO_INT(Rare, valRare); + + uint64_t maxFreq; + maxFreq = freq[2 * 4 - 1]; + VEC_LOAD_OFFSET(F0, freq, 0 * sizeof(VEC_T)) ; + VEC_LOAD_OFFSET(F1, freq, 1 * sizeof(VEC_T)); + + if (COMPILER_RARELY(maxFreq < valRare)) goto ADVANCE_FREQ; + +ADVANCE_RARE: + do { + *matchOut = static_cast(valRare); + valRare = rare[1]; // for next iteration + ASM_LEA_ADD_BYTES(rare, sizeof(*rare)); // rare += 1; + + if (COMPILER_RARELY(rare >= stopRare)) { + rare -= 1; + goto FINISH_SCALAR; + } + + VEC_CMP_EQUAL(F0, Rare) ; + 
VEC_CMP_EQUAL(F1, Rare); + + VEC_SET_ALL_TO_INT(Rare, valRare); + + VEC_OR(F0, F1); + + VEC_ADD_PTEST(matchOut, 1, F0); + + VEC_LOAD_OFFSET(F0, freq, 0 * sizeof(VEC_T)) ; + VEC_LOAD_OFFSET(F1, freq, 1 * sizeof(VEC_T)); + + } while (maxFreq >= valRare); + + uint64_t maxProbe; + +ADVANCE_FREQ: + do { + const uint64_t kProbe = (0 + 1) * 2 * 4; + const uint32_t *probeFreq = freq + kProbe; + maxProbe = freq[(0 + 2) * 2 * 4 - 1]; + + if (COMPILER_RARELY(probeFreq >= stopFreq)) { + goto FINISH_SCALAR; + } + + freq = probeFreq; + + } while (maxProbe < valRare); + + maxFreq = maxProbe; + + VEC_LOAD_OFFSET(F0, freq, 0 * sizeof(VEC_T)) ; + VEC_LOAD_OFFSET(F1, freq, 1 * sizeof(VEC_T)); + + goto ADVANCE_RARE; + + size_t count; +FINISH_SCALAR: + count = matchOut - matchOrig; + + lenFreq = stopFreq + kFreqSpace - freq; + lenRare = stopRare + kRareSpace - rare; + + size_t tail = match_scalar(freq, lenFreq, rare, lenRare, matchOut); + + return count + tail; +} + +size_t danfarfar_medium(const uint32_t *rare, const size_t lenRare, + const uint32_t *freq, const size_t lenFreq, uint32_t * out) { + if (lenFreq == 0 || lenRare == 0) + return 0; + assert(lenRare <= lenFreq); + const uint32_t * const initout (out); + typedef __m128i vec; + const uint32_t veclen = sizeof(vec) / sizeof(uint32_t); + const size_t vecmax = veclen - 1; + const size_t freqspace = 32 * veclen; + const size_t rarespace = 1; + + const uint32_t *stopFreq = freq + lenFreq - freqspace; + const uint32_t *stopRare = rare + lenRare - rarespace; + if (freq > stopFreq) { + return nate_scalar(freq, lenFreq, rare, lenRare, out); + } + while (freq[veclen * 31 + vecmax] < *rare) { + freq += veclen * 32; + if (freq > stopFreq) + goto FINISH_SCALAR; + } + for (; rare < stopRare; ++rare) { + const uint32_t matchRare = *rare;//nextRare; + const vec Match = _mm_set1_epi32(matchRare); + while (freq[veclen * 31 + vecmax] < matchRare) { // if no match possible + freq += veclen * 32; // advance 32 vectors + if (freq > stopFreq) + goto 
FINISH_SCALAR; + } + vec Q0,Q1,Q2,Q3; + if(freq[veclen * 15 + vecmax] >= matchRare ) { + if(freq[veclen * 7 + vecmax] < matchRare ) { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 8), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 9), Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 10), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 11), Match)); + + Q2 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 12), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 13), Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 14), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 15), Match)); + } else { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 4), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 5), Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 6), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 7), Match)); + Q2 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 0), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 1), Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 2), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 3), Match)); + } + } + else + { + if(freq[veclen * 23 + vecmax] < matchRare ) { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 8 + 16), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 9 + 16), Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 10+ 16), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 11+ 16), Match)); + + Q2 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 12+ 16), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 13+ 16), Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 14+ 16), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 15+ 16), Match)); + } else { + Q0 = _mm_or_si128( + 
_mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 4+ 16), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 5+ 16), Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 6+ 16), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 7+ 16), Match)); + Q2 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 0+ 16), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 1+ 16), Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 2+ 16), Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 3+ 16), Match)); + } + + } + const vec F0 = _mm_or_si128(_mm_or_si128(Q0, Q1),_mm_or_si128(Q2, Q3)); + if (_mm_testz_si128(F0, F0)) { + } else { + *out++ = matchRare; + } + } + + FINISH_SCALAR: return (out - initout) + nate_scalar(freq, + stopFreq + freqspace - freq, rare, stopRare + rarespace - rare, out); +} + + +size_t SIMDgalloping(const uint32_t *rare, const size_t lenRare, + const uint32_t *freq, const size_t lenFreq, uint32_t * out) { + if (lenFreq == 0 || lenRare == 0) + return 0; + assert(lenRare <= lenFreq); + const uint32_t * const initout (out); + typedef __m128i vec; + const uint32_t veclen = sizeof(vec) / sizeof(uint32_t); + const size_t vecmax = veclen - 1; + const size_t freqspace = 32 * veclen; + const size_t rarespace = 1; + + const uint32_t *stopFreq = freq + lenFreq - freqspace; + const uint32_t *stopRare = rare + lenRare - rarespace; + if (freq > stopFreq) { + return nate_scalar(freq, lenFreq, rare, lenRare, out); + } + for (; rare < stopRare; ++rare) { + const uint32_t matchRare = *rare;//nextRare; + const vec Match = _mm_set1_epi32(matchRare); + + if (freq[veclen * 31 + vecmax] < matchRare) { // if no match possible + uint32_t offset = 1; + if (freq + veclen * 32 > stopFreq) { + freq += veclen * 32; + goto FINISH_SCALAR; + } + while (freq[veclen * offset * 32 + veclen * 31 + vecmax] + < matchRare) { // if no match possible + if (freq + veclen * (2 * offset ) * 32 <= stopFreq) { + offset *= 2; 
+ } else if (freq + veclen * (offset + 1) * 32 <= stopFreq) { + offset = static_cast((stopFreq - freq ) / (veclen * 32)); + //offset += 1; + if (freq[veclen * offset * 32 + veclen * 31 + vecmax] + < matchRare) { + freq += veclen * offset * 32; + goto FINISH_SCALAR; + } else { + break; + } + } else { + freq += veclen * offset * 32; + goto FINISH_SCALAR; + } + } + uint32_t lower = offset / 2; + while (lower + 1 != offset) { + const uint32_t mid = (lower + offset) / 2; + if (freq[veclen * mid * 32 + veclen * 31 + vecmax] + < matchRare) + lower = mid; + else + offset = mid; + } + freq += veclen * offset * 32; + } + vec Q0,Q1,Q2,Q3; + if (freq[veclen * 15 + vecmax] >= matchRare) { + if (freq[veclen * 7 + vecmax] < matchRare) { + Q0 + = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 8), Match), + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 9), Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 10), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 11), + Match)); + + Q2 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 12), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 13), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 14), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 15), + Match)); + } else { + Q0 + = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 4), Match), + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 5), Match)); + Q1 + = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 6), Match), + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 7), Match)); + Q2 + = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 0), Match), + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 1), Match)); + Q3 + = _mm_or_si128( + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 2), Match), + _mm_cmpeq_epi32( + _mm_load_si128((vec *) freq + 3), Match)); + } + } else { + if (freq[veclen * 23 + vecmax] < matchRare) 
{ + Q0 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 8 + 16), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 9 + 16), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 10 + 16), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 11 + 16), + Match)); + + Q2 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 12 + 16), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 13 + 16), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 14 + 16), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 15 + 16), + Match)); + } else { + Q0 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 4 + 16), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 5 + 16), + Match)); + Q1 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 6 + 16), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 7 + 16), + Match)); + Q2 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 0 + 16), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 1 + 16), + Match)); + Q3 = _mm_or_si128( + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 2 + 16), + Match), + _mm_cmpeq_epi32(_mm_load_si128((vec *) freq + 3 + 16), + Match)); + } + + } + const vec F0 = _mm_or_si128(_mm_or_si128(Q0, Q1),_mm_or_si128(Q2, Q3)); + if (_mm_testz_si128(F0, F0)) { + } else { + *out++ = matchRare; + } + } + + FINISH_SCALAR: return (out - initout) + nate_scalar(freq, + stopFreq + freqspace - freq, rare, stopRare + rarespace - rare, out); +} + +/** + * Our main heuristic. 
+ */ +size_t SIMDintersection(const uint32_t * set1, + const size_t length1, const uint32_t * set2, const size_t length2, uint32_t *out) { + if ((length1==0) or (length2 == 0)) return 0; + if ((1000 * length1 <= length2) or (1000 * length2 <= length1)) { + if (length1 < length2) + return SIMDgalloping(set1, length1, set2, length2,out); + else + return SIMDgalloping(set2, length2, set1, length1,out); + } + + if ((50 * length1 <= length2) or (50 * length2 <= length1)) { + if (length1 < length2) + return danfarfar_medium(set1, length1, set2, length2,out); + else + return danfarfar_medium(set2, length2, set1, length1,out); + } + if (length1 < length2) + return match_v4_f2_p0(set1, length1, set2, length2, out); + else + return match_v4_f2_p0(set2, length2, set1, length1, out); +} + +inline std::map initializeintersectionfactory() { + std::map schemes; + schemes[ "simd" ] = SIMDintersection; + schemes[ "galloping" ] = onesidedgallopingintersection; + schemes[ "scalar" ] = nate_scalar; + schemes[ "v1" ] = match_v4_f2_p0; + schemes["v3"] = danfarfar_medium; + schemes["simdgalloping"] =SIMDgalloping; + + return schemes; +} + +std::map IntersectionFactory::intersection_schemes = initializeintersectionfactory(); + + + diff --git a/src/simdbitpacking.cpp b/src/simdbitpacking.cpp new file mode 100644 index 0000000..8598ae6 --- /dev/null +++ b/src/simdbitpacking.cpp @@ -0,0 +1,13792 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire + */ +#include "simdbitpacking.h" + +using namespace std; + + void __SIMD_fastpackwithoutmask0(const uint32_t * __restrict__ , __m128i * __restrict__ ) {} +void __SIMD_fastpack0(const uint32_t * __restrict__ , __m128i * __restrict__ ) {} +/** + * Rest of the code is borrowed from the fastpfor project (file of the same name) + * with the remove of a few functions at the end and removal of the prefix. Also + * functions are no longer static. 
+ */ + +void __SIMD_fastpackwithoutmask1(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
19)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask2(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_load_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask3(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask5(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
5)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_load_si128(++in); + + OutReg = 
// Packs 128 consecutive 32-bit integers from _in into 6 SSE registers at out,
// 6 bits per value, WITHOUT masking the inputs (caller guarantees every value
// is < 2^6; stray high bits would corrupt neighbors).  Layout is "vertical":
// SSE lane k (k = 0..3) holds the bit-packed stream of _in[4*i + k],
// i = 0..31, little-endian within each 32-bit word.  Both _in and out must be
// 16-byte aligned (aligned loads/stores are used throughout).
void __SIMD_fastpackwithoutmask6(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 6 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 6 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    // 16 values consumed exactly 3 words; the pattern now repeats.
    InReg = _mm_load_si128(++in);
    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 6 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 6 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
}
// Packs 128 consecutive 32-bit integers from _in into 7 SSE registers at out,
// 7 bits per value, WITHOUT masking the inputs (caller guarantees every value
// is < 2^7).  Vertical layout: SSE lane k packs _in[4*i + k], i = 0..31,
// little-endian within each 32-bit word.  _in and out must be 16-byte aligned.
void __SIMD_fastpackwithoutmask7(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 7 - 3);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 7 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 7 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 7 - 5);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 7 - 1);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 7 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
    _mm_store_si128(out, OutReg);
}
// Packs 128 consecutive 32-bit integers from _in into 9 SSE registers at out,
// 9 bits per value, WITHOUT masking the inputs (caller guarantees every value
// is < 2^9).  Vertical layout: SSE lane k packs _in[4*i + k], i = 0..31,
// little-endian within each 32-bit word.  _in and out must be 16-byte aligned.
void __SIMD_fastpackwithoutmask9(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 9 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 9 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 9 - 3);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 9 - 7);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 9 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 9 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 9 - 1);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 9 - 5);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
    _mm_store_si128(out, OutReg);
}
// Packs 128 consecutive 32-bit integers from _in into 10 SSE registers at
// out, 10 bits per value, WITHOUT masking the inputs (caller guarantees every
// value is < 2^10).  Vertical layout: SSE lane k packs _in[4*i + k],
// i = 0..31, little-endian within each 32-bit word.  _in and out must be
// 16-byte aligned.
void __SIMD_fastpackwithoutmask10(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 10 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 10 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 10 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 10 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    _mm_store_si128(out, OutReg);
    ++out;
    // 16 values consumed exactly 5 words; the pattern now repeats.
    InReg = _mm_load_si128(++in);
    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 10 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 10 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 10 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 10 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    _mm_store_si128(out, OutReg);
}
// Packs 128 consecutive 32-bit integers from _in into 11 SSE registers at
// out, 11 bits per value, WITHOUT masking the inputs (caller guarantees every
// value is < 2^11).  Vertical layout: SSE lane k packs _in[4*i + k],
// i = 0..31, little-endian within each 32-bit word.  _in and out must be
// 16-byte aligned.
void __SIMD_fastpackwithoutmask11(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 1);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 3);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 5);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 7);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 9);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 11 - 10);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
    _mm_store_si128(out, OutReg);
}
// Packs 128 consecutive 32-bit integers from _in into 12 SSE registers at
// out, 12 bits per value, WITHOUT masking the inputs (caller guarantees every
// value is < 2^12).  Vertical layout: SSE lane k packs _in[4*i + k],
// i = 0..31, little-endian within each 32-bit word.  Since 8 values fill
// exactly 3 words, the same 8-value pattern repeats four times.
// _in and out must be 16-byte aligned.
void __SIMD_fastpackwithoutmask12(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 12 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 12 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    _mm_store_si128(out, OutReg);
    ++out;
    InReg = _mm_load_si128(++in);
    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 12 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 12 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    _mm_store_si128(out, OutReg);
    ++out;
    InReg = _mm_load_si128(++in);
    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 12 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 12 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    _mm_store_si128(out, OutReg);
    ++out;
    InReg = _mm_load_si128(++in);
    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 12 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 12 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    _mm_store_si128(out, OutReg);
}
// Packs 128 consecutive 32-bit integers from _in into 13 SSE registers at
// out, 13 bits per value, WITHOUT masking the inputs (caller guarantees every
// value is < 2^13).  Vertical layout: SSE lane k packs _in[4*i + k],
// i = 0..31, little-endian within each 32-bit word.  _in and out must be
// 16-byte aligned.
void __SIMD_fastpackwithoutmask13(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 7);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 1);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 9);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 3);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 10);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 11);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 5);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 12);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 13 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
    _mm_store_si128(out, OutReg);
}
// Packs 128 consecutive 32-bit integers from _in into 14 SSE registers at
// out, 14 bits per value, WITHOUT masking the inputs (caller guarantees every
// value is < 2^14).  Vertical layout: SSE lane k packs _in[4*i + k],
// i = 0..31, little-endian within each 32-bit word.  16 values fill exactly
// 7 words, so the same pattern runs twice.  _in and out must be 16-byte
// aligned.
void __SIMD_fastpackwithoutmask14(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 10);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 12);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    _mm_store_si128(out, OutReg);
    ++out;
    InReg = _mm_load_si128(++in);
    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 10);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 12);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 14 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    _mm_store_si128(out, OutReg);
}
// Packs 128 consecutive 32-bit integers from _in into 15 SSE registers at
// out, 15 bits per value, WITHOUT masking the inputs (caller guarantees every
// value is < 2^15).  Vertical layout: SSE lane k packs _in[4*i + k],
// i = 0..31, little-endian within each 32-bit word.  _in and out must be
// 16-byte aligned.
void __SIMD_fastpackwithoutmask15(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: restored the template argument of reinterpret_cast, which had been
    // stripped to the ill-formed "reinterpret_cast(_in)".
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg;
    __m128i InReg = _mm_load_si128(in);

    OutReg = InReg;
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
    InReg = _mm_load_si128(++in);
    // Value straddles the 32-bit boundary: flush the word, keep the high bits.
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 13);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 11);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 9);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 7);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 5);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 3);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 1);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 14);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 12);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 10);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 8);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 6);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 4);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
    _mm_store_si128(out, OutReg);
    ++out;
    OutReg = _mm_srli_epi32(InReg, 15 - 2);
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
    InReg = _mm_load_si128(++in);
    OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
    _mm_store_si128(out, OutReg);
}
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask18(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask19(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + 
InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask20(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
_mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask21(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask22(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = 
InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask23(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask24(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + 
_mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + 
_mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask25(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask26(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask27(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + +} + + + +void 
__SIMD_fastpackwithoutmask28(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); 
+ InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask29(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask30(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + 
InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_load_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
4)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask31(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 
31 - 11); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask32(const uint32_t * __restrict__ _in, __m128i * 
__restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = 
InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpackwithoutmask4(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; + + for(uint32_t outer=0; outer< 4 ;++outer) { + InReg = _mm_load_si128(in); + OutReg = InReg; + + InReg = _mm_load_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_load_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_load_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_load_si128(in+4); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_load_si128(in+5); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_load_si128(in+6); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_load_si128(in+7); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
28)); + _mm_store_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +void __SIMD_fastpackwithoutmask8(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; + + for(uint32_t outer=0; outer< 8 ;++outer) { + InReg = _mm_load_si128(in); + OutReg = InReg; + + InReg = _mm_load_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_load_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_load_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +void __SIMD_fastpackwithoutmask16(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; + + for(uint32_t outer=0; outer< 16 ;++outer) { + InReg = _mm_load_si128(in); + OutReg = InReg; + + InReg = _mm_load_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + +void __SIMD_fastpack1(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<1)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack2(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack3(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack5(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + 
InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack6(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + 
_mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + 
_mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack7(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack9(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = 
InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack10(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack11(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack12(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, 
OutReg); + + +} + + + +void __SIMD_fastpack13(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack14(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + 
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack15(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + __m128i InReg = 
_mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack17(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + 
InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, 
OutReg); + + +} + + + +void __SIMD_fastpack18(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + 
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack19(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack20(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack21(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + 
+ + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack22(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + 
// Packs 128 unsigned 32-bit integers into 23 x 128-bit words, 23 bits per
// value, operating on the four 32-bit SSE sub-lanes independently (input
// vector i supplies element i of each of the four interleaved streams).
// _in : 128 source values, 16-byte aligned (read as 32 aligned __m128i loads);
//       each value is masked to its low 23 bits before packing.
// out : receives exactly 23 packed __m128i words.
// This is the rolled-up form of the generated straight-line kernel; the trip
// count and bit width are compile-time constants, so an optimizing compiler
// unrolls it back into the equivalent shift/or sequence.
void __SIMD_fastpack23(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr int kBits = 23;                         // payload width per integer
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    int used = 0;                                     // bits already filled in OutReg
    for (int i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_load_si128(in + i), mask);
        // Append the masked value above the bits already accumulated.
        OutReg = (used == 0) ? InReg
                             : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += kBits;
        if (used >= 32) {                             // word full: flush, keep spill-over
            _mm_store_si128(out++, OutReg);
            used -= 32;
            if (used > 0)
                OutReg = _mm_srli_epi32(InReg, kBits - used);
        }
    }
}
// Packs 128 unsigned 32-bit integers into 24 x 128-bit words, 24 bits per
// value, operating on the four 32-bit SSE sub-lanes independently (input
// vector i supplies element i of each of the four interleaved streams).
// _in : 128 source values, 16-byte aligned (read as 32 aligned __m128i loads);
//       each value is masked to its low 24 bits before packing.
// out : receives exactly 24 packed __m128i words.
// Rolled-up form of the generated straight-line kernel; constant trip count
// and bit width let the compiler unroll it back into the shift/or sequence.
void __SIMD_fastpack24(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr int kBits = 24;                         // payload width per integer
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    int used = 0;                                     // bits already filled in OutReg
    for (int i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_load_si128(in + i), mask);
        OutReg = (used == 0) ? InReg
                             : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += kBits;
        if (used >= 32) {                             // word full: flush, keep spill-over
            _mm_store_si128(out++, OutReg);
            used -= 32;
            if (used > 0)
                OutReg = _mm_srli_epi32(InReg, kBits - used);
        }
    }
}
// Packs 128 unsigned 32-bit integers into 25 x 128-bit words, 25 bits per
// value, operating on the four 32-bit SSE sub-lanes independently (input
// vector i supplies element i of each of the four interleaved streams).
// _in : 128 source values, 16-byte aligned (read as 32 aligned __m128i loads);
//       each value is masked to its low 25 bits before packing.
// out : receives exactly 25 packed __m128i words.
// Rolled-up form of the generated straight-line kernel; constant trip count
// and bit width let the compiler unroll it back into the shift/or sequence.
void __SIMD_fastpack25(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr int kBits = 25;                         // payload width per integer
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    int used = 0;                                     // bits already filled in OutReg
    for (int i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_load_si128(in + i), mask);
        OutReg = (used == 0) ? InReg
                             : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += kBits;
        if (used >= 32) {                             // word full: flush, keep spill-over
            _mm_store_si128(out++, OutReg);
            used -= 32;
            if (used > 0)
                OutReg = _mm_srli_epi32(InReg, kBits - used);
        }
    }
}
// Packs 128 unsigned 32-bit integers into 26 x 128-bit words, 26 bits per
// value, operating on the four 32-bit SSE sub-lanes independently (input
// vector i supplies element i of each of the four interleaved streams).
// _in : 128 source values, 16-byte aligned (read as 32 aligned __m128i loads);
//       each value is masked to its low 26 bits before packing.
// out : receives exactly 26 packed __m128i words.
// Rolled-up form of the generated straight-line kernel; constant trip count
// and bit width let the compiler unroll it back into the shift/or sequence.
void __SIMD_fastpack26(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr int kBits = 26;                         // payload width per integer
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    int used = 0;                                     // bits already filled in OutReg
    for (int i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_load_si128(in + i), mask);
        OutReg = (used == 0) ? InReg
                             : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += kBits;
        if (used >= 32) {                             // word full: flush, keep spill-over
            _mm_store_si128(out++, OutReg);
            used -= 32;
            if (used > 0)
                OutReg = _mm_srli_epi32(InReg, kBits - used);
        }
    }
}
// Packs 128 unsigned 32-bit integers into 27 x 128-bit words, 27 bits per
// value, operating on the four 32-bit SSE sub-lanes independently (input
// vector i supplies element i of each of the four interleaved streams).
// _in : 128 source values, 16-byte aligned (read as 32 aligned __m128i loads);
//       each value is masked to its low 27 bits before packing.
// out : receives exactly 27 packed __m128i words.
// Rolled-up form of the generated straight-line kernel; constant trip count
// and bit width let the compiler unroll it back into the shift/or sequence.
void __SIMD_fastpack27(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr int kBits = 27;                         // payload width per integer
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    int used = 0;                                     // bits already filled in OutReg
    for (int i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_load_si128(in + i), mask);
        OutReg = (used == 0) ? InReg
                             : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += kBits;
        if (used >= 32) {                             // word full: flush, keep spill-over
            _mm_store_si128(out++, OutReg);
            used -= 32;
            if (used > 0)
                OutReg = _mm_srli_epi32(InReg, kBits - used);
        }
    }
}
// Packs 128 unsigned 32-bit integers into 28 x 128-bit words, 28 bits per
// value, operating on the four 32-bit SSE sub-lanes independently (input
// vector i supplies element i of each of the four interleaved streams).
// _in : 128 source values, 16-byte aligned (read as 32 aligned __m128i loads);
//       each value is masked to its low 28 bits before packing.
// out : receives exactly 28 packed __m128i words.
// Rolled-up form of the generated straight-line kernel; constant trip count
// and bit width let the compiler unroll it back into the shift/or sequence.
void __SIMD_fastpack28(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr int kBits = 28;                         // payload width per integer
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    int used = 0;                                     // bits already filled in OutReg
    for (int i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_load_si128(in + i), mask);
        OutReg = (used == 0) ? InReg
                             : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += kBits;
        if (used >= 32) {                             // word full: flush, keep spill-over
            _mm_store_si128(out++, OutReg);
            used -= 32;
            if (used > 0)
                OutReg = _mm_srli_epi32(InReg, kBits - used);
        }
    }
}
8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack29(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + 
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg 
= _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack30(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = 
reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg 
= _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = 
_mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack31(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); 
+ InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(_mm_load_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack32(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i InReg = _mm_load_si128(in); + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, 
OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + ++out; + InReg = _mm_load_si128(++in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + +} + + + +void __SIMD_fastpack4(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + + for(uint32_t outer=0; outer< 4 ;++outer) { + InReg = _mm_and_si128(_mm_load_si128(in), 
mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_load_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_and_si128(_mm_load_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_load_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_and_si128(_mm_load_si128(in+4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_load_si128(in+5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_and_si128(_mm_load_si128(in+6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_and_si128(_mm_load_si128(in+7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +void __SIMD_fastpack8(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + + for(uint32_t outer=0; outer< 8 ;++outer) { + InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_load_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_load_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_load_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +void __SIMD_fastpack16(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + + for(uint32_t outer=0; outer< 16 ;++outer) { + InReg = _mm_and_si128(_mm_load_si128(in), mask); + OutReg = InReg; + + InReg = 
_mm_and_si128(_mm_load_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + + +void __SIMD_fastunpack1(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg1 = _mm_load_si128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + unsigned shift = 0; + + for (unsigned i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + } +} + + + + +void __SIMD_fastunpack2(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , 
mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,26) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack3(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack4(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + 
OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack5(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack6(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_store_si128(out++, OutReg); + + OutReg 
= _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack7(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); 
+ _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack8(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack9(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg 
= _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack10(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack11(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_store_si128(out++, OutReg); + + 
OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack12(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = 
_mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack13(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = 
reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; 
+ InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack14(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack15(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack16(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = 
_mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack17(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + _mm_store_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack18(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + 
+ OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack19(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + 
InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack20(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack21(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + _mm_store_si128(out++, OutReg); + 
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack22(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), 
mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + 
InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack23(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<23)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), 
mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack24(const __m128i* 
__restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack25(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack26(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + 
InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) 
; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack27(const 
__m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack28(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + 
+ OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); 
+ + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack29(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), 
mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack30(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = 
_mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_load_si128(++in); + + _mm_store_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack31(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + 
_mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg 
= _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_load_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + _mm_store_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,1) ; + _mm_store_si128(out++, OutReg); + + +} + + + + +void __SIMD_fastunpack32(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + __m128i* out = reinterpret_cast<__m128i*>(_out); + for(uint32_t outer=0; outer< 32 ;++outer) { + _mm_store_si128(out++, _mm_load_si128(in++)); + } +} + + diff --git a/src/simdintegratedbitpacking.cpp b/src/simdintegratedbitpacking.cpp new file mode 100644 index 0000000..08d486d --- /dev/null +++ b/src/simdintegratedbitpacking.cpp @@ -0,0 +1,25287 @@ + +/** +* This code is released under the +* Apache 
License Version 2.0 http://www.apache.org/licenses/. +* +*/ +#include "simdintegratedbitpacking.h" + + + +template +__m128i iunpack0(__m128i initOffset, const __m128i * , uint32_t * _out) { + __m128i *out = reinterpret_cast<__m128i*>(_out); + static const __m128i zero = _mm_set1_epi32 (0); + + for (unsigned i = 0; i < 8; ++i) { + initOffset = DeltaHelper::PrefixSum(zero, initOffset); + _mm_store_si128(out++, initOffset); + initOffset = DeltaHelper::PrefixSum(zero, initOffset); + _mm_store_si128(out++, initOffset); + initOffset = DeltaHelper::PrefixSum(zero, initOffset); + _mm_store_si128(out++, initOffset); + initOffset = DeltaHelper::PrefixSum(zero, initOffset); + _mm_store_si128(out++, initOffset); + } + + return initOffset; +} + + + +template +void ipackwithoutmask0(__m128i , const uint32_t * , __m128i * ) { + +} + +template +void ipack0(__m128i , const uint32_t * , __m128i * ) { +} + + +template +void ipackwithoutmask1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(1U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(3U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); 
+ ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); 
+ initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(7U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + +}

// ---------------------------------------------------------------------------
// Delta-then-bit-pack kernels (SSE2).
//
// Each ipackB / ipackwithoutmaskB routine reads 32 aligned __m128i registers
// (= 128 uint32 values carried in 4 independent SIMD lanes), replaces each
// register by DeltaHelper::Delta(current, previous) -- where "previous" for
// the first load is `initOffset` -- and packs the resulting B-bit values into
// B output registers, least-significant bits first.
//
// The "withoutmask" variants assume every delta already fits in B bits; the
// plain variants mask each delta to its low B bits before packing.
//
// NOTE(review): the generated original repeated the same unrolled body once
// per bit width, with the per-width carry shifts hard-coded (e.g.
// `_mm_srli_epi32(InReg, 5 - 3)` when a 5-bit value straddled a word
// boundary). All widths now share one worker so that carry logic lives in a
// single place; with B a compile-time constant the compiler can unroll the
// loop and recover the immediate-shift forms.
// ---------------------------------------------------------------------------

// Shared packing worker.
//   DeltaHelper : provides static __m128i Delta(__m128i curr, __m128i prev)
//   B           : bit width of each packed value (1..31)
//   ApplyMask   : true  -> mask each delta to its low B bits   (ipackB)
//                 false -> caller guarantees deltas fit in B bits
//                                                              (ipackwithoutmaskB)
template <class DeltaHelper, uint32_t B, bool ApplyMask>
void ipack_delta_worker(__m128i initOffset, const uint32_t *_in, __m128i *out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    const __m128i mask = _mm_set1_epi32((1U << B) - 1);
    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bit offset of the next value inside the current output word

    for (int k = 0; k < 32; ++k) {
        const __m128i CurrIn = _mm_load_si128(in);
        ++in;
        __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset);
        if (ApplyMask)
            InReg = _mm_and_si128(InReg, mask);
        // The next delta is taken against the raw (unpacked) input register.
        initOffset = CurrIn;

        if (shift == 0)
            OutReg = InReg;
        else
            OutReg = _mm_or_si128(
                OutReg,
                _mm_sll_epi32(InReg, _mm_cvtsi32_si128(static_cast<int>(shift))));

        shift += B;
        if (shift >= 32) {  // the per-lane 32-bit slot is full: flush it
            _mm_store_si128(out, OutReg);
            ++out;
            shift -= 32;
            if (shift != 0)  // value straddled the word boundary: keep its high bits
                OutReg = _mm_srl_epi32(
                    InReg, _mm_cvtsi32_si128(static_cast<int>(B - shift)));
        }
    }
}

// Pack 128 4-bit deltas (caller guarantees they fit) into 4 __m128i.
template <class DeltaHelper>
void ipackwithoutmask4(__m128i initOffset, const uint32_t *_in, __m128i *out) {
    ipack_delta_worker<DeltaHelper, 4, false>(initOffset, _in, out);
}

// Pack 128 deltas, masked to their low 4 bits, into 4 __m128i.
template <class DeltaHelper>
void ipack4(__m128i initOffset, const uint32_t *_in, __m128i *out) {
    ipack_delta_worker<DeltaHelper, 4, true>(initOffset, _in, out);
}

// Pack 128 5-bit deltas (caller guarantees they fit) into 5 __m128i.
template <class DeltaHelper>
void ipackwithoutmask5(__m128i initOffset, const uint32_t *_in, __m128i *out) {
    ipack_delta_worker<DeltaHelper, 5, false>(initOffset, _in, out);
}

// Pack 128 deltas, masked to their low 5 bits, into 5 __m128i.
template <class DeltaHelper>
void ipack5(__m128i initOffset, const uint32_t *_in, __m128i *out) {
    ipack_delta_worker<DeltaHelper, 5, true>(initOffset, _in, out);
}

// Pack 128 6-bit deltas (caller guarantees they fit) into 6 __m128i.
template <class DeltaHelper>
void ipackwithoutmask6(__m128i initOffset, const uint32_t *_in, __m128i *out) {
    ipack_delta_worker<DeltaHelper, 6, false>(initOffset, _in, out);
}

template +void ipack6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(63U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + 
CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, 
OutReg); + + +} + + + +template +void ipack7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(127U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + 
_mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(255U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(511U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + 
++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(1023U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + 
++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + 
_mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(2047U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(4095U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + 
+ ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(8191U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + +} + + + 
+template +void ipackwithoutmask14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(16383U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + 
++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(32767U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(65535U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + 
+ ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + 
_mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + 
CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(131071U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + 
CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + +} + + + 
+template +void ipackwithoutmask18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg 
= _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(262143U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
// Packs 128 32-bit values (32 aligned SSE registers = 4 interleaved lanes)
// into 19-bit fields, producing 19 output registers (32 * 19 = 608 = 19 * 32
// bits per lane). Each input register is first transformed by
// DeltaHelper::Delta(CurrIn, previous register) -- presumably a per-lane
// difference against the preceding register; confirm against the
// DeltaHelper definition. "withoutmask" variant: the caller guarantees every
// transformed value fits in 19 bits (no masking is applied here).
//
// NOTE(review): the original machine-generated unrolled body had lost its
// template parameter list and the reinterpret_cast type argument (angle
// brackets stripped in transit). This restores it as the equivalent rolled
// loop: `shift` tracks how many bits of the current output register are
// occupied, exactly mirroring the unrolled store/spill sequence.
template <class DeltaHelper>
void ipackwithoutmask19(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bits of the current output register already filled
    for (int k = 0; k < 32; ++k, ++in) {
        const __m128i CurrIn = _mm_load_si128(in);
        const __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset);
        initOffset = CurrIn;
        // Start a fresh output word, or append at the current bit offset.
        OutReg = (shift == 0)
                     ? InReg
                     : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, static_cast<int>(shift)));
        shift += 19;
        if (shift >= 32) {  // output word full: flush it, keep the spilled high bits
            _mm_store_si128(out, OutReg);
            ++out;
            shift -= 32;
            if (shift > 0)
                OutReg = _mm_srli_epi32(InReg, static_cast<int>(19 - shift));
        }
    }
}
// Masked 19-bit packer: packs 128 32-bit values (32 aligned SSE registers =
// 4 interleaved lanes) into 19 output registers. Each input register is
// transformed by DeltaHelper::Delta(CurrIn, previous register) -- presumably
// a per-lane difference; confirm against the DeltaHelper definition -- and
// then masked to its low 19 bits, so arbitrary deltas are safe.
//
// NOTE(review): the original machine-generated unrolled body had lost its
// template parameter list and the reinterpret_cast type argument (angle
// brackets stripped in transit). This restores it as the equivalent rolled
// loop; `shift` is the bit cursor inside the current output register.
template <class DeltaHelper>
void ipack19(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    const __m128i mask = _mm_set1_epi32((1U << 19) - 1);  // 524287U in the original
    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bits of the current output register already filled
    for (int k = 0; k < 32; ++k, ++in) {
        const __m128i CurrIn = _mm_load_si128(in);
        const __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask);
        initOffset = CurrIn;
        OutReg = (shift == 0)
                     ? InReg
                     : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, static_cast<int>(shift)));
        shift += 19;
        if (shift >= 32) {  // output word full: flush it, keep the spilled high bits
            _mm_store_si128(out, OutReg);
            ++out;
            shift -= 32;
            if (shift > 0)
                OutReg = _mm_srli_epi32(InReg, static_cast<int>(19 - shift));
        }
    }
}
// Packs 128 32-bit values (32 aligned SSE registers = 4 interleaved lanes)
// into 20-bit fields, producing 20 output registers. Each input register is
// first transformed by DeltaHelper::Delta(CurrIn, previous register) --
// presumably a per-lane difference; confirm against the DeltaHelper
// definition. "withoutmask" variant: the caller guarantees every transformed
// value fits in 20 bits (no masking is applied here).
//
// NOTE(review): the original machine-generated unrolled body had lost its
// template parameter list and the reinterpret_cast type argument (angle
// brackets stripped in transit). This restores it as the equivalent rolled
// loop. Because 32 * 20 realigns every 8 values, the bit cursor `shift`
// periodically returns to 0 and a fresh word starts, exactly as the
// unrolled `OutReg = InReg` resets did.
template <class DeltaHelper>
void ipackwithoutmask20(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bits of the current output register already filled
    for (int k = 0; k < 32; ++k, ++in) {
        const __m128i CurrIn = _mm_load_si128(in);
        const __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset);
        initOffset = CurrIn;
        OutReg = (shift == 0)
                     ? InReg
                     : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, static_cast<int>(shift)));
        shift += 20;
        if (shift >= 32) {  // output word full: flush it, keep the spilled high bits
            _mm_store_si128(out, OutReg);
            ++out;
            shift -= 32;
            if (shift > 0)
                OutReg = _mm_srli_epi32(InReg, static_cast<int>(20 - shift));
        }
    }
}
// Masked 20-bit packer: packs 128 32-bit values (32 aligned SSE registers =
// 4 interleaved lanes) into 20 output registers. Each input register is
// transformed by DeltaHelper::Delta(CurrIn, previous register) -- presumably
// a per-lane difference; confirm against the DeltaHelper definition -- and
// then masked to its low 20 bits, so arbitrary deltas are safe.
//
// NOTE(review): the original machine-generated unrolled body had lost its
// template parameter list and the reinterpret_cast type argument (angle
// brackets stripped in transit). This restores it as the equivalent rolled
// loop; the bit cursor `shift` realigns to 0 every 8 values (32 | 8 * 20),
// matching the unrolled `OutReg = InReg` resets.
template <class DeltaHelper>
void ipack20(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    const __m128i mask = _mm_set1_epi32((1U << 20) - 1);  // 1048575U in the original
    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bits of the current output register already filled
    for (int k = 0; k < 32; ++k, ++in) {
        const __m128i CurrIn = _mm_load_si128(in);
        const __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask);
        initOffset = CurrIn;
        OutReg = (shift == 0)
                     ? InReg
                     : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, static_cast<int>(shift)));
        shift += 20;
        if (shift >= 32) {  // output word full: flush it, keep the spilled high bits
            _mm_store_si128(out, OutReg);
            ++out;
            shift -= 32;
            if (shift > 0)
                OutReg = _mm_srli_epi32(InReg, static_cast<int>(20 - shift));
        }
    }
}
// Packs 128 32-bit values (32 aligned SSE registers = 4 interleaved lanes)
// into 21-bit fields, producing 21 output registers (32 * 21 = 672 = 21 * 32
// bits per lane). Each input register is first transformed by
// DeltaHelper::Delta(CurrIn, previous register) -- presumably a per-lane
// difference; confirm against the DeltaHelper definition. "withoutmask"
// variant: the caller guarantees every transformed value fits in 21 bits
// (no masking is applied here).
//
// NOTE(review): the original machine-generated unrolled body had lost its
// template parameter list and the reinterpret_cast type argument (angle
// brackets stripped in transit). This restores it as the equivalent rolled
// loop; `shift` is the bit cursor inside the current output register.
template <class DeltaHelper>
void ipackwithoutmask21(__m128i initOffset, const uint32_t * _in, __m128i * out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bits of the current output register already filled
    for (int k = 0; k < 32; ++k, ++in) {
        const __m128i CurrIn = _mm_load_si128(in);
        const __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset);
        initOffset = CurrIn;
        OutReg = (shift == 0)
                     ? InReg
                     : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, static_cast<int>(shift)));
        shift += 21;
        if (shift >= 32) {  // output word full: flush it, keep the spilled high bits
            _mm_store_si128(out, OutReg);
            ++out;
            shift -= 32;
            if (shift > 0)
                OutReg = _mm_srli_epi32(InReg, static_cast<int>(21 - shift));
        }
    }
}
+ // --- ipack21: delta-encode and bit-pack 21-bit values with SSE2 (generated code) ---
+ // NOTE(review): this paste has collapsed the generated source onto single lines and
+ // stripped angle-bracket text: "template" below lost its parameter list (the body uses
+ // DeltaHelper::Delta, so presumably "template <class DeltaHelper>" — confirm against the
+ // generator), and "reinterpret_cast(_in)" lost its type argument (presumably
+ // <const __m128i *>).
+ // Visible behavior: each step loads a __m128i, takes DeltaHelper::Delta(CurrIn, initOffset),
+ // masks the delta to 21 bits (mask = 2097151 = 2^21 - 1), and ORs it left-shifted into
+ // OutReg; a value straddling a 32-bit word boundary is completed after the store via the
+ // _mm_srli_epi32(InReg, 21 - k) carry term.
_mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(2097151U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); 
+ _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
+ // --- ipackwithoutmask22: delta-encode and bit-pack 22-bit values, no masking (generated) ---
+ // NOTE(review): "template" below has lost its parameter list and "reinterpret_cast(_in)"
+ // its type argument in this paste (presumably "template <class DeltaHelper>" and
+ // <const __m128i *> — confirm against the generator).
+ // Unlike the ipackNN variants, this function applies no _mm_and_si128 mask to the deltas,
+ // so the caller presumably guarantees every delta fits in 22 bits — values wider than 22
+ // bits would corrupt neighboring packed fields. Each delta is ORed left-shifted into
+ // OutReg; straddling values are completed via the _mm_srli_epi32(22 - k) carry after each store.
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
+ // --- ipack22: delta-encode and bit-pack 22-bit values with SSE2 (generated code) ---
+ // NOTE(review): "template" below has lost its parameter list and "reinterpret_cast(_in)"
+ // its type argument in this paste (presumably "template <class DeltaHelper>" and
+ // <const __m128i *> — confirm against the generator). The doubled semicolon after
+ // _mm_set1_epi32(4194303U) is a harmless empty statement from the generator.
+ // Visible behavior: each delta DeltaHelper::Delta(CurrIn, initOffset) is masked to 22 bits
+ // (mask = 4194303 = 2^22 - 1) and ORed left-shifted into OutReg; values straddling a 32-bit
+ // word boundary are completed via the _mm_srli_epi32(22 - k) carry term after each store.
_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(4194303U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 
22 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = 
+ // --- ipackwithoutmask23: delta-encode and bit-pack 23-bit values, no masking (generated) ---
+ // NOTE(review): "template" below has lost its parameter list and "reinterpret_cast(_in)"
+ // its type argument in this paste (presumably "template <class DeltaHelper>" and
+ // <const __m128i *> — confirm against the generator).
+ // As with the other "withoutmask" variants, no _mm_and_si128 is applied to the deltas, so
+ // the caller presumably guarantees every delta fits in 23 bits; wider values would corrupt
+ // neighboring packed fields. Straddling values are completed via the
+ // _mm_srli_epi32(23 - k) carry term after each store.
_mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask23(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack23(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(8388607U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(16777215U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + 
InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(33554431U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); 
+ __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(67108863U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); 
+ initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 
- 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(134217727U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 
23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(268435455U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + 
CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(536870911U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 
16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = 
_mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(1073741823U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask31(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = DeltaHelper::Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack31(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const static __m128i mask = _mm_set1_epi32(2147483647U); ; + + __m128i CurrIn = _mm_load_si128(in); + __m128i InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_store_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; 
+ CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_load_si128(in); + InReg = 
_mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_store_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_load_si128(in); + InReg = _mm_and_si128(DeltaHelper::Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipackwithoutmask32(__m128i /* initOffset */ , const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + __m128i InReg = _mm_load_si128(in); + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + 
_mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + 
+ OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + +} + + + +template +void ipack32(__m128i /* initOffset */ , const uint32_t * _in, __m128i * out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + + __m128i InReg = _mm_load_si128(in); + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + 
_mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_load_si128(in); + + OutReg = InReg; + _mm_store_si128(out, OutReg); + + +} + + + + +template +__m128i iunpack1(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<1)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack2(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = 
_mm_set1_epi32((1U<<2)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset 
= OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i 
iunpack3(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, 
// Unpack 128 values packed at 4 bits each from `in` (4 aligned 128-bit
// words) into `_out` (128 x uint32_t), integrating every vector of four
// deltas through DeltaHelper::PrefixSum starting from `initOffset`.
// Returns the final running offset so the caller can chain blocks.
template <class DeltaHelper>
__m128i iunpack4(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i* out = reinterpret_cast<__m128i*>(_out);
    // Keep only the low 4 bits of each 32-bit lane.
    const __m128i mask = _mm_set1_epi32((1U<<4)-1);

    // 4 divides 32, so each input word holds exactly 8 fields and no
    // field straddles a word boundary: 4 input registers -> 32 stores.
    for (int word = 0; word < 4; ++word) {
        const __m128i InReg = _mm_load_si128(in + word);
        for (int shift = 0; shift < 32; shift += 4) {
            const __m128i delta =
                _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            // Integrate the delta into the running prefix sum.
            initOffset = DeltaHelper::PrefixSum(delta, initOffset);
            _mm_store_si128(out++, initOffset);
        }
    }

    return initOffset;

}
// Unpack 128 values packed at 5 bits each from `in` (5 aligned 128-bit
// words) into `_out` (128 x uint32_t), integrating every vector of four
// deltas through DeltaHelper::PrefixSum starting from `initOffset`.
// Returns the final running offset so the caller can chain blocks.
template <class DeltaHelper>
__m128i iunpack5(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i* out = reinterpret_cast<__m128i*>(_out);
    const __m128i mask = _mm_set1_epi32((1U<<5)-1);
    __m128i InReg = _mm_load_si128(in);

    int shift = 0;  // bit offset of the current field inside the word
    for (int i = 0; i < 32; ++i) {
        __m128i delta;
        if (shift + 5 <= 32) {
            // Field fits entirely inside the current word.
            delta = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += 5;
            if (shift == 32 && i != 31) {  // word exhausted exactly
                InReg = _mm_load_si128(++in);
                shift = 0;
            }
        } else {
            // Field straddles the word boundary: high bits come from the
            // current word, low bits from the next one.
            delta = _mm_srli_epi32(InReg, shift);
            InReg = _mm_load_si128(++in);
            delta = _mm_or_si128(
                delta, _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift += 5 - 32;
        }
        // Integrate the delta into the running prefix sum.
        initOffset = DeltaHelper::PrefixSum(delta, initOffset);
        _mm_store_si128(out++, initOffset);
    }

    return initOffset;

}
// Unpack 128 values packed at 6 bits each from `in` (6 aligned 128-bit
// words) into `_out` (128 x uint32_t), integrating every vector of four
// deltas through DeltaHelper::PrefixSum starting from `initOffset`.
// Returns the final running offset so the caller can chain blocks.
template <class DeltaHelper>
__m128i iunpack6(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i* out = reinterpret_cast<__m128i*>(_out);
    const __m128i mask = _mm_set1_epi32((1U<<6)-1);
    __m128i InReg = _mm_load_si128(in);

    int shift = 0;  // bit offset of the current field inside the word
    for (int i = 0; i < 32; ++i) {
        __m128i delta;
        if (shift + 6 <= 32) {
            // Field fits entirely inside the current word.
            delta = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += 6;
            if (shift == 32 && i != 31) {  // word exhausted exactly
                InReg = _mm_load_si128(++in);
                shift = 0;
            }
        } else {
            // Field straddles the word boundary: high bits come from the
            // current word, low bits from the next one.
            delta = _mm_srli_epi32(InReg, shift);
            InReg = _mm_load_si128(++in);
            delta = _mm_or_si128(
                delta, _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift += 6 - 32;
        }
        // Integrate the delta into the running prefix sum.
        initOffset = DeltaHelper::PrefixSum(delta, initOffset);
        _mm_store_si128(out++, initOffset);
    }

    return initOffset;

}
// Unpack 128 values packed at 7 bits each from `in` (7 aligned 128-bit
// words) into `_out` (128 x uint32_t), integrating every vector of four
// deltas through DeltaHelper::PrefixSum starting from `initOffset`.
// Returns the final running offset so the caller can chain blocks.
template <class DeltaHelper>
__m128i iunpack7(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i* out = reinterpret_cast<__m128i*>(_out);
    const __m128i mask = _mm_set1_epi32((1U<<7)-1);
    __m128i InReg = _mm_load_si128(in);

    int shift = 0;  // bit offset of the current field inside the word
    for (int i = 0; i < 32; ++i) {
        __m128i delta;
        if (shift + 7 <= 32) {
            // Field fits entirely inside the current word.
            delta = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += 7;
            if (shift == 32 && i != 31) {  // word exhausted exactly
                InReg = _mm_load_si128(++in);
                shift = 0;
            }
        } else {
            // Field straddles the word boundary: high bits come from the
            // current word, low bits from the next one.
            delta = _mm_srli_epi32(InReg, shift);
            InReg = _mm_load_si128(++in);
            delta = _mm_or_si128(
                delta, _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift += 7 - 32;
        }
        // Integrate the delta into the running prefix sum.
        initOffset = DeltaHelper::PrefixSum(delta, initOffset);
        _mm_store_si128(out++, initOffset);
    }

    return initOffset;

}
// Unpack 128 values packed at 8 bits each from `in` (8 aligned 128-bit
// words) into `_out` (128 x uint32_t), integrating every vector of four
// deltas through DeltaHelper::PrefixSum starting from `initOffset`.
// Returns the final running offset so the caller can chain blocks.
template <class DeltaHelper>
__m128i iunpack8(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i* out = reinterpret_cast<__m128i*>(_out);
    // Keep only the low 8 bits of each 32-bit lane.
    const __m128i mask = _mm_set1_epi32((1U<<8)-1);

    // 8 divides 32, so each input word holds exactly 4 fields and no
    // field straddles a word boundary: 8 input registers -> 32 stores.
    for (int word = 0; word < 8; ++word) {
        const __m128i InReg = _mm_load_si128(in + word);
        for (int shift = 0; shift < 32; shift += 8) {
            const __m128i delta =
                _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            // Integrate the delta into the running prefix sum.
            initOffset = DeltaHelper::PrefixSum(delta, initOffset);
            _mm_store_si128(out++, initOffset);
        }
    }

    return initOffset;

}
// Unpack 128 values packed at 9 bits each from `in` (9 aligned 128-bit
// words) into `_out` (128 x uint32_t), integrating every vector of four
// deltas through DeltaHelper::PrefixSum starting from `initOffset`.
// Returns the final running offset so the caller can chain blocks.
template <class DeltaHelper>
__m128i iunpack9(__m128i initOffset, const __m128i* in, uint32_t * _out) {

    __m128i* out = reinterpret_cast<__m128i*>(_out);
    const __m128i mask = _mm_set1_epi32((1U<<9)-1);
    __m128i InReg = _mm_load_si128(in);

    int shift = 0;  // bit offset of the current field inside the word
    for (int i = 0; i < 32; ++i) {
        __m128i delta;
        if (shift + 9 <= 32) {
            // Field fits entirely inside the current word.
            delta = _mm_and_si128(_mm_srli_epi32(InReg, shift), mask);
            shift += 9;
            if (shift == 32 && i != 31) {  // word exhausted exactly
                InReg = _mm_load_si128(++in);
                shift = 0;
            }
        } else {
            // Field straddles the word boundary: high bits come from the
            // current word, low bits from the next one.
            delta = _mm_srli_epi32(InReg, shift);
            InReg = _mm_load_si128(++in);
            delta = _mm_or_si128(
                delta, _mm_and_si128(_mm_slli_epi32(InReg, 32 - shift), mask));
            shift += 9 - 32;
        }
        // Integrate the delta into the running prefix sum.
        initOffset = DeltaHelper::PrefixSum(delta, initOffset);
        _mm_store_si128(out++, initOffset);
    }

    return initOffset;

}
= InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), 
mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack11(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = 
tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack12(__m128i initOffset, const __m128i* 
in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset 
= OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack13(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack14(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + 
++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack15(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + + + tmp = InReg; + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack16(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); 
+ OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + 
++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack17(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = 
_mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack18(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + 
OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
18-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack19(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + 
OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + 
++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack20(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + 
OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
20-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack21(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; 
+ _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), 
mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + OutReg 
= DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack22(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; 
+ _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
// ---------------------------------------------------------------------------
// iunpack23: decode 128 values packed at 23 bits each (23 input __m128i
// words -> 32 output __m128i words), applying DeltaHelper::PrefixSum to each
// extracted register against the running offset.
//
// FIX(review): the generated source read "template __m128i iunpack23(...)"
// with the parameter list stripped, so DeltaHelper was undeclared and the
// function could not compile; "<class DeltaHelper>" is restored here.
//
// @param initOffset  running prefix-sum state carried from the previous block
// @param in          16-byte-aligned pointer to 23 packed input registers
// @param _out        16-byte-aligned destination for 128 uint32_t values
// @return            updated prefix-sum state (the last decoded register)
template <class DeltaHelper>
__m128i iunpack23(__m128i initOffset, const __m128i *in, uint32_t *_out) {

    __m128i *out = reinterpret_cast<__m128i *>(_out);
    __m128i InReg = _mm_load_si128(in);
    __m128i OutReg;
    const __m128i mask = _mm_set1_epi32((1U << 23) - 1);

    // Fields that fit inside the current 32-bit word are masked out directly;
    // fields straddling a word boundary OR in the low bits of the next input
    // register (loaded with ++in).

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(InReg, mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 23); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 14), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 14); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 5), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 5), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 28); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 19), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 19); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 10), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 10); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 1), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 1), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 24); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 15), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 15); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 6), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 6), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 29); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 20), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 20); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 11), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 11); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 2), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 2), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 25); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 16), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 16); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 7), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 7), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 30); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 21), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 21); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 12), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 12); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 3), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 3), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 26); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 17), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 17); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 8), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 8), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 31); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 22), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 22); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 13), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 13); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 4), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 4), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 27); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 18), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 18); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23 - 9), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    // Last field ends exactly on the word boundary (9 + 23 == 32): no mask.
    OutReg = DeltaHelper::PrefixSum(_mm_srli_epi32(InReg, 9), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    return initOffset;
}
// ---------------------------------------------------------------------------
// iunpack24: decode 128 values packed at 24 bits each (24 input __m128i
// words -> 32 output __m128i words), applying DeltaHelper::PrefixSum to each
// extracted register against the running offset.
//
// FIX(review): the generated source read "template __m128i iunpack24(...)"
// with the parameter list stripped, so DeltaHelper was undeclared and the
// function could not compile; "<class DeltaHelper>" is restored here.
// Because 24 divides evenly into 3 words per 4 values, the original's eight
// identical unrolled cycles are expressed as one loop with identical
// load/store order.
//
// @param initOffset  running prefix-sum state carried from the previous block
// @param in          16-byte-aligned pointer to 24 packed input registers
// @param _out        16-byte-aligned destination for 128 uint32_t values
// @return            updated prefix-sum state (the last decoded register)
template <class DeltaHelper>
__m128i iunpack24(__m128i initOffset, const __m128i *in, uint32_t *_out) {

    __m128i *out = reinterpret_cast<__m128i *>(_out);
    __m128i InReg = _mm_load_si128(in);
    __m128i OutReg;
    const __m128i mask = _mm_set1_epi32((1U << 24) - 1);

    // Each cycle consumes 3 input registers and emits 4 decoded registers.
    for (int cycle = 0; cycle < 8; ++cycle) {
        OutReg = DeltaHelper::PrefixSum(_mm_and_si128(InReg, mask), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 24); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 16), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 16); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24 - 8), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        // Final field of the cycle ends on the word boundary (8 + 24 == 32);
        // preload the next cycle's register except after the last cycle.
        OutReg = _mm_srli_epi32(InReg, 8);
        if (cycle != 7) InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(OutReg, initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);
    }

    return initOffset;
}
// ---------------------------------------------------------------------------
// iunpack25: decode 128 values packed at 25 bits each (25 input __m128i
// words -> 32 output __m128i words), applying DeltaHelper::PrefixSum to each
// extracted register against the running offset.
//
// FIX(review): the generated source read "template __m128i iunpack25(...)"
// with the parameter list stripped, so DeltaHelper was undeclared and the
// function could not compile; "<class DeltaHelper>" is restored here.
//
// @param initOffset  running prefix-sum state carried from the previous block
// @param in          16-byte-aligned pointer to 25 packed input registers
// @param _out        16-byte-aligned destination for 128 uint32_t values
// @return            updated prefix-sum state (the last decoded register)
template <class DeltaHelper>
__m128i iunpack25(__m128i initOffset, const __m128i *in, uint32_t *_out) {

    __m128i *out = reinterpret_cast<__m128i *>(_out);
    __m128i InReg = _mm_load_si128(in);
    __m128i OutReg;
    const __m128i mask = _mm_set1_epi32((1U << 25) - 1);

    // Fields that fit inside the current 32-bit word are masked out directly;
    // fields straddling a word boundary OR in the low bits of the next input
    // register (loaded with ++in).

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(InReg, mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 25); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 18), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 18); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 11), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 11); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 4), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 4), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 29); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 22), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 22); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 15), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 15); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 8), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 8); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 1), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 1), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 26); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 19), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 19); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 12), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 12); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 5), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 5), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 30); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 23), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 23); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 16), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 16); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 9), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 9); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 2), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 2), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 27); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 20), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 20); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 13), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 13); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 6), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 6), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 31); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 24), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 24); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 17), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 17); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 10), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 10); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 3), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 3), mask), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 28); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 21), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 21); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 14), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    OutReg = _mm_srli_epi32(InReg, 14); InReg = _mm_load_si128(++in);
    OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25 - 7), mask)), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    // Last field ends exactly on the word boundary (7 + 25 == 32): no mask.
    OutReg = DeltaHelper::PrefixSum(_mm_srli_epi32(InReg, 7), initOffset);
    initOffset = OutReg; _mm_store_si128(out++, OutReg);

    return initOffset;
}
// ---------------------------------------------------------------------------
// iunpack26: decode 128 values packed at 26 bits each (26 input __m128i
// words -> 32 output __m128i words), applying DeltaHelper::PrefixSum to each
// extracted register against the running offset.
//
// FIX(review): the generated source read "template __m128i iunpack26(...)"
// with the parameter list stripped, so DeltaHelper was undeclared and the
// function could not compile; "<class DeltaHelper>" is restored here.
// The bit layout repeats after 16 values (13 input words), so the original's
// two identical unrolled halves are expressed as a 2-iteration loop with
// identical load/store order.
//
// @param initOffset  running prefix-sum state carried from the previous block
// @param in          16-byte-aligned pointer to 26 packed input registers
// @param _out        16-byte-aligned destination for 128 uint32_t values
// @return            updated prefix-sum state (the last decoded register)
template <class DeltaHelper>
__m128i iunpack26(__m128i initOffset, const __m128i *in, uint32_t *_out) {

    __m128i *out = reinterpret_cast<__m128i *>(_out);
    __m128i InReg = _mm_load_si128(in);
    __m128i OutReg;
    const __m128i mask = _mm_set1_epi32((1U << 26) - 1);

    // Each half consumes 13 input registers and emits 16 decoded registers.
    for (int half = 0; half < 2; ++half) {
        OutReg = DeltaHelper::PrefixSum(_mm_and_si128(InReg, mask), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 26); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 20), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 20); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 14), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 14); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 8), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 8); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 2), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 2), mask), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 28); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 22), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 22); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 16), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 16); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 10), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 10); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 4), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = DeltaHelper::PrefixSum(_mm_and_si128(_mm_srli_epi32(InReg, 4), mask), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 30); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 24), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 24); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 18), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 18); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 12), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        OutReg = _mm_srli_epi32(InReg, 12); InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26 - 6), mask)), initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);

        // Final field of the half ends on the word boundary (6 + 26 == 32);
        // preload the second half's first register except after the last half.
        OutReg = _mm_srli_epi32(InReg, 6);
        if (half == 0) InReg = _mm_load_si128(++in);
        OutReg = DeltaHelper::PrefixSum(OutReg, initOffset);
        initOffset = OutReg; _mm_store_si128(out++, OutReg);
    }

    return initOffset;
}
= tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack27(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = 
_mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); 
+ + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack28(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; 
+ OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; 
+ OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack29(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = 
_mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = 
_mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), 
mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack30(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg 
= _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp 
= _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,2); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + + +template +__m128i iunpack31(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_load_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); 
+ initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_load_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = tmp; + OutReg = DeltaHelper::PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_store_si128(out++, OutReg); + + + return initOffset; + +} + + + +template +__m128i iunpack32(__m128i , const __m128i* in, uint32_t * _out) { + __m128i * mout = reinterpret_cast<__m128i *>(_out); + __m128i invec; + for(size_t k = 0; k < 128/4; ++k) { + invec = _mm_load_si128(in++); + _mm_store_si128(mout++, invec); + } + return invec; + //memcpy(_out,in,128*4); + //return _mm_load_si128(in+31); +} + + +template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template void ipack0(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack1(__m128i, const __m128i 
*, uint32_t *); +template void ipack1(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template void ipack2(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template void ipack3(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template void ipack4(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template void ipack5(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template void ipack6(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack7(__m128i, const __m128i *, uint32_t *); +template void ipack7(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template void ipack8(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack9(__m128i, const __m128i *, uint32_t *); +template void ipack9(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template void ipack10(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); + + +template 
__m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template void ipack11(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template void ipack12(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack13(__m128i, const __m128i *, uint32_t *); +template void ipack13(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template void ipack14(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template void ipack15(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template void ipack16(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template void ipack17(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template void ipack18(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack19(__m128i, const __m128i *, uint32_t *); +template void ipack19(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template void ipack20(__m128i, const uint32_t *, __m128i *); +template void 
ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template void ipack21(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template void ipack22(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack23(__m128i, const __m128i *, uint32_t *); +template void ipack23(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template void ipack24(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template void ipack25(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template void ipack26(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template void ipack27(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack28(__m128i, const __m128i *, uint32_t *); +template void ipack28(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template void ipack29(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template void 
ipack30(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template void ipack31(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack32(__m128i, const __m128i *, uint32_t *); +template void ipack32(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template void ipack0(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack1(__m128i, const __m128i *, uint32_t *); +template void ipack1(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template void ipack2(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template void ipack3(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template void ipack4(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template void ipack5(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template void ipack6(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack7(__m128i, const 
__m128i *, uint32_t *); +template void ipack7(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template void ipack8(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack9(__m128i, const __m128i *, uint32_t *); +template void ipack9(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template void ipack10(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template void ipack11(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template void ipack12(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack13(__m128i, const __m128i *, uint32_t *); +template void ipack13(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template void ipack14(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template void ipack15(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template void ipack16(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask16(__m128i, const uint32_t *, 
__m128i *); + + +template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template void ipack17(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template void ipack18(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack19(__m128i, const __m128i *, uint32_t *); +template void ipack19(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template void ipack20(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template void ipack21(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template void ipack22(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack23(__m128i, const __m128i *, uint32_t *); +template void ipack23(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template void ipack24(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template void ipack25(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template void ipack26(__m128i, const uint32_t *, __m128i *); 
+template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template void ipack27(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack28(__m128i, const __m128i *, uint32_t *); +template void ipack28(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template void ipack29(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template void ipack30(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template void ipack31(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack32(__m128i, const __m128i *, uint32_t *); +template void ipack32(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template void ipack0(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack1(__m128i, const __m128i *, uint32_t *); +template void ipack1(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template void ipack2(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template 
void ipack3(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template void ipack4(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template void ipack5(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template void ipack6(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack7(__m128i, const __m128i *, uint32_t *); +template void ipack7(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template void ipack8(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack9(__m128i, const __m128i *, uint32_t *); +template void ipack9(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template void ipack10(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template void ipack11(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template void ipack12(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack13(__m128i, 
const __m128i *, uint32_t *); +template void ipack13(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template void ipack14(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template void ipack15(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template void ipack16(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template void ipack17(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template void ipack18(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack19(__m128i, const __m128i *, uint32_t *); +template void ipack19(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template void ipack20(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template void ipack21(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template void ipack22(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask22(__m128i, const 
uint32_t *, __m128i *); + + +template __m128i iunpack23(__m128i, const __m128i *, uint32_t *); +template void ipack23(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template void ipack24(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template void ipack25(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template void ipack26(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template void ipack27(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack28(__m128i, const __m128i *, uint32_t *); +template void ipack28(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template void ipack29(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template void ipack30(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template void ipack31(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack32(__m128i, const __m128i *, uint32_t *); +template void ipack32(__m128i, const uint32_t *, 
__m128i *); +template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack0(__m128i, const __m128i *, uint32_t *); +template void ipack0(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask0(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack1(__m128i, const __m128i *, uint32_t *); +template void ipack1(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask1(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack2(__m128i, const __m128i *, uint32_t *); +template void ipack2(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask2(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack3(__m128i, const __m128i *, uint32_t *); +template void ipack3(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask3(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack4(__m128i, const __m128i *, uint32_t *); +template void ipack4(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask4(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack5(__m128i, const __m128i *, uint32_t *); +template void ipack5(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask5(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack6(__m128i, const __m128i *, uint32_t *); +template void ipack6(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask6(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack7(__m128i, const __m128i *, uint32_t *); +template void ipack7(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask7(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack8(__m128i, const __m128i *, uint32_t *); +template void ipack8(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask8(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack9(__m128i, const __m128i *, uint32_t *); +template void 
ipack9(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask9(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack10(__m128i, const __m128i *, uint32_t *); +template void ipack10(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask10(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack11(__m128i, const __m128i *, uint32_t *); +template void ipack11(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask11(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack12(__m128i, const __m128i *, uint32_t *); +template void ipack12(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask12(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack13(__m128i, const __m128i *, uint32_t *); +template void ipack13(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask13(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack14(__m128i, const __m128i *, uint32_t *); +template void ipack14(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask14(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack15(__m128i, const __m128i *, uint32_t *); +template void ipack15(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask15(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack16(__m128i, const __m128i *, uint32_t *); +template void ipack16(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask16(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack17(__m128i, const __m128i *, uint32_t *); +template void ipack17(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask17(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack18(__m128i, const __m128i *, uint32_t *); +template void ipack18(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask18(__m128i, const uint32_t *, __m128i *); + + +template __m128i 
iunpack19(__m128i, const __m128i *, uint32_t *); +template void ipack19(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask19(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack20(__m128i, const __m128i *, uint32_t *); +template void ipack20(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask20(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack21(__m128i, const __m128i *, uint32_t *); +template void ipack21(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask21(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack22(__m128i, const __m128i *, uint32_t *); +template void ipack22(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask22(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack23(__m128i, const __m128i *, uint32_t *); +template void ipack23(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask23(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack24(__m128i, const __m128i *, uint32_t *); +template void ipack24(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask24(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack25(__m128i, const __m128i *, uint32_t *); +template void ipack25(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask25(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack26(__m128i, const __m128i *, uint32_t *); +template void ipack26(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask26(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack27(__m128i, const __m128i *, uint32_t *); +template void ipack27(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask27(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack28(__m128i, const __m128i *, uint32_t *); +template void ipack28(__m128i, const uint32_t *, __m128i *); +template void 
ipackwithoutmask28(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack29(__m128i, const __m128i *, uint32_t *); +template void ipack29(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask29(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack30(__m128i, const __m128i *, uint32_t *); +template void ipack30(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask30(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack31(__m128i, const __m128i *, uint32_t *); +template void ipack31(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask31(__m128i, const uint32_t *, __m128i *); + + +template __m128i iunpack32(__m128i, const __m128i *, uint32_t *); +template void ipack32(__m128i, const uint32_t *, __m128i *); +template void ipackwithoutmask32(__m128i, const uint32_t *, __m128i *); + diff --git a/src/testcodecs.cpp b/src/testcodecs.cpp new file mode 100644 index 0000000..de9d635 --- /dev/null +++ b/src/testcodecs.cpp @@ -0,0 +1,192 @@ +#include +#include "synthetic.h" +#include "binarypacking.h" +#include "simdbinarypacking.h" +#include "simdfastpfor.h" +#include "timer.h" +#include "delta.h" +#include "variablebyte.h" +#include "compositecodec.h" +#include "codecfactory.h" + +struct dataarray { + dataarray() : + name(), data() { + } + string name; + vector> data; +}; + + +class EntropyRecorder { +public: + EntropyRecorder() : + counter(), totallength(0) { + } + + void clear() { + counter.clear(); + totallength = 0; + } + void eat(const uint32_t * in, const size_t length) { + if (length == 0) + return; + totallength += length; + for (uint32_t k = 0; k < length; ++k, ++in) { + maptype::iterator i = counter.find(*in); + if (i != counter.end()) + i->second += 1; + else + counter[*in] = 1; + } + } + + double computeShannon() { + double total = 0; + for (maptype::iterator i = counter.begin(); i + != counter.end(); ++i) { + const double x = static_cast(i->second); + total += x / 
static_cast(totallength) * log(static_cast(totallength) / x) / log(2.0); + } + return total; + } + + __attribute__ ((pure)) + double computeDataBits() { + double total = 0; + for (maptype::const_iterator i = counter.begin(); i + != counter.end(); ++i) { + total += static_cast(i->second) / static_cast(totallength) * static_cast(gccbits(i->first)); + } + return total; + } + typedef unordered_map maptype; + maptype counter; + size_t totallength; +}; + + + +void sillybenchmark(vector datas, + vector & compressedbuffer, vector & recoverybuffer, + IntegerCODEC & codec) { + cout << "#benchmarking " << CODECFactory::getName(codec) << endl;//codec.name() + WallClockTimer z; + double packtime, unpacktime; + cout << "#name , bits/int , coding speed (mis) , decoding speed (mis)" + << endl; + for (vector::const_iterator it = datas.begin(); it + != datas.end(); ++it) { + const vector > & data = it->data; + vector membuffer; + packtime = 0; + unpacktime = 0; + double compsize = 0; + double intcounter = 0; + // dry run + for(const vector & D : data) { + vector < uint32_t > dirtycopy(D); + size_t nvalue = compressedbuffer.size(); + codec.encodeArray(dirtycopy.data(), dirtycopy.size(), + compressedbuffer.data(), nvalue); + size_t recoveredvalues = recoverybuffer.size(); + codec.decodeArray(compressedbuffer.data(), nvalue, + recoverybuffer.data(), recoveredvalues); + if(recoveredvalues != dirtycopy.size()) throw runtime_error("bug"); + } + // actual run + for(const vector & D : data) { + vector < uint32_t > dirtycopy(D); + intcounter += static_cast(dirtycopy.size()); + size_t nvalue = compressedbuffer.size(); + z.reset(); + codec.encodeArray(dirtycopy.data(), dirtycopy.size(), + compressedbuffer.data(), nvalue); + packtime += static_cast (z.split()); + compsize += static_cast (nvalue); + size_t recoveredvalues = recoverybuffer.size(); + double bestunpacktime = std::numeric_limits::infinity(); + for(int t = 0; t<5; ++t) { + z.reset(); + codec.decodeArray(compressedbuffer.data(), 
nvalue, + recoverybuffer.data(), recoveredvalues); + double tup = static_cast (z.split()); + if(tupname << "\t" + << (static_cast (compsize) * 32.0 / intcounter + ) << "\t" + << intcounter + / static_cast (packtime) << "\t" + << intcounter + / static_cast (unpacktime) << endl; + } + cout << endl; +} + + + +void benchmark(const uint32_t S, vector> & allcodecs) { + const uint32_t N = 1U << S; + cout<<"# using arrays of size "< datas; + cout<<"#generating data..."; + cout<( + round( + 40.0*static_cast(1U<<16) / static_cast(N) + ) + ); + if(Times == 0) Times = 1; + cout<<"# Generating "< copy(X.data.back()); + delta(0U,copy.data(),copy.size()); + er.eat(copy.data(),copy.size()); + } + cout<<"#entropy of "< compressedbuffer; + compressedbuffer.resize(N * 2); + vector < uint32_t > recoverybuffer; + recoverybuffer.resize(N); + for(auto i : allcodecs) + sillybenchmark(datas,compressedbuffer,recoverybuffer,*i); +} + +void displayUsage() { + cout << "run as testcodecs nameofcodec1 nameofcodec2 ..." << endl; + cout << "where codecs are:" << endl; + vector < string > all = CODECFactory::allNames(); + for (auto i = all.begin(); i != all.end(); ++i) { + cout << *i << endl; + } +} +int main(int argc, char **argv) { + if (argc <= 1) { + displayUsage(); + return -1; + } + vector> allcodecs; + for (int k = 1; k < argc; ++k) { + shared_ptr p = CODECFactory::getFromName(argv[k]); + if (p.get() == NULL) + return -2; + allcodecs.push_back(p); + } + benchmark(16, allcodecs); + return 0; + +} diff --git a/src/testintegration.cpp b/src/testintegration.cpp new file mode 100644 index 0000000..fb7bb7a --- /dev/null +++ b/src/testintegration.cpp @@ -0,0 +1,177 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + */ + +#include "common.h" +#include "util.h" +#include "timer.h" +#include "bitpackinghelpers.h" +#include "simdbitpackinghelpers.h" +#include "delta.h" +#include "synthetic.h" + +using namespace std; + + + +vector maskedcopy(const vector & in, const uint32_t bit) { + vector out (in); + if (bit == 32) + return out; + for (auto i = out.begin(); i != out.end(); ++i) { + *i = *i % (1U << bit); + } + return out; +} + +template +bool equalOnFirstBits(const container32bit & data, + const container32bit & recovered, uint32_t bit) { + if (bit == 32) { + return data == recovered; + } + for (uint32_t k = 0; k < data.size(); ++k) { + if (data[k] % (1U << bit) != recovered[k] % (1U << bit)) { + cout << " They differ at k = " << k << " data[k]= " << data[k] + << " recovered[k]=" << recovered[k] << endl; + return false; + } + } + return true; +} + +uint32_t mask(uint32_t bit) { + if(bit == 32) return 0xFFFFFFFFU; + return (1U< +void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) { + T = T + 1; // we have a warming up pass + uint32_t bogus = 0; + vector data(N); + vector compressed(N); + vector icompressed(N); + vector recovered(N); + WallClockTimer z; + double unpacktime; + double iunpacktime; + + cout << "#million of integers per second: higher is better" << endl; + cout << "#bit, unpack,iunpack" << endl; + + + for (uint32_t bitindex = 0; bitindex < 32; ++bitindex) { + uint32_t bit = bitindex + 1; + vector < uint32_t > initdata(N); + for(size_t i = 0 ; 4*i < data.size() ; i+=4) { + initdata[i] = random(bit) + (i>=4? 
initdata[i-4]:0); + for(size_t j = 1; j < 4; ++j) { + initdata[i+j] = initdata[i]; + } + } + + //Helper::GenRandom(initdata, bit); + + const vector refdata = initdata; + vector().swap(initdata); + + icompressed.clear(); + // 4 * N should be enough for all schemes + icompressed.resize(4 * N, 0); + compressed.clear(); + // 4 * N should be enough for all schemes + compressed.resize(4 * N, 0); + recovered.clear(); + recovered.resize(N, 0); + + if (needPaddingTo128Bits(recovered.data())) { + throw logic_error("Array is not aligned on 128 bit boundary!"); + } + if (needPaddingTo128Bits(icompressed.data())) { + throw logic_error("Array is not aligned on 128 bit boundary!"); + } + if (needPaddingTo128Bits(compressed.data())) { + throw logic_error("Array is not aligned on 128 bit boundary!"); + } + if (needPaddingTo128Bits(refdata.data())) { + throw logic_error("Array is not aligned on 128 bit boundary!"); + } + + //Helper::CheckMaxDiff(refdata, bit); + + for (uint32_t repeat = 0; repeat < 1; ++repeat) { + + unpacktime = 0; + + iunpacktime = 0; + + for (uint32_t t = 0; t <= T; ++t) { + + assert(data.size() == refdata.size()); + fill(icompressed.begin(), icompressed.end(), 0); + fill(recovered.begin(), recovered.end(), 0); + memcpy(data.data(), refdata.data(), data.size() * sizeof(decltype(data)::value_type));//memcpy can be slow + Helper::pack(data.data(), data.size(), icompressed.data(), bit); + z.reset(); + Helper::unpack(icompressed.data(), refdata.size(), recovered.data(), bit); + if (t > 0) + unpacktime += static_cast(z.split()); + if (!equalOnFirstBits(refdata, recovered, bit)) { + cout << " Bug 1a " << bit << endl; + return; + } + memcpy(data.data(), refdata.data(), data.size() * sizeof(decltype(data)::value_type));//memcpy can be slow + Helper::pack(data.data(), data.size(), icompressed.data(), bit); + + z.reset(); + Helper::iunpack(icompressed.data(), refdata.size(), recovered.data(), bit); + if (t > 0) + iunpacktime += static_cast(z.split()); + if 
(!equalOnFirstBits(refdata, recovered, bit)) { + cout << " Bug 2 " << bit << endl; + return; + } + } + + cout << std::setprecision(4) << bit << "\t\t"; + cout << "\t\t" << N * T / (unpacktime) << "\t\t"; + + cout << "\t\t"<< N * T / (iunpacktime); + + cout << endl; + } + + } + cout<<"# ignore this "<< bogus < >(1U << 12, 1U << 14); + cout< >(1U << 12, 1U << 14); + cout< >(1U << 12, 1U << 14); + cout< >(1U << 12, 1U << 14); + cout<(1U << 12, 1U << 14); + cout< data; +}; + + +void testSmall(vector> codecs) { + vector data; + data.push_back(1U); + data.push_back(3U); + data.push_back(5U); + data.push_back(15U+1024U); + data.push_back(21U+1024U); + + for(shared_ptr codec : codecs) { + vector dirtycopy(data); + vector compressedbuffer(data.size() + 1024); + vector recoverybuffer(data.size() + 1024); + size_t nvalue = compressedbuffer.size(); + codec->encodeArray(dirtycopy.data(),dirtycopy.size(),compressedbuffer.data(),nvalue); + size_t recoveredvalues = recoverybuffer.size(); + codec->decodeArray(compressedbuffer.data(),nvalue,recoverybuffer.data(),recoveredvalues); + recoverybuffer.resize(recoveredvalues); + if(data != recoverybuffer) { + cout<<"Problem with "<name()< datas,vector & compressedbuffer, + vector & recoverybuffer, IntegerCODEC & codec) { + for(vector::const_iterator i = datas.begin() ; + i!= datas.end() ; ++i) { + const vector & data = i->data; + vector dirtycopy(data); + size_t nvalue = compressedbuffer.size(); + codec.encodeArray(dirtycopy.data(),dirtycopy.size(),compressedbuffer.data(),nvalue); + size_t recoveredvalues = recoverybuffer.size(); + codec.decodeArray(compressedbuffer.data(),nvalue,recoverybuffer.data(),recoveredvalues); + recoverybuffer.resize(recoveredvalues); + if(data != recoverybuffer) { + cout<<"Problem with "<5) { + cout<<"..."<> & allcodecs,const uint32_t S ) { + const uint32_t N = 1U << S; + vector datas; + dataarray X; + vector < uint32_t > d(N); + for (uint32_t k = 0; k < N; ++k) + d[k] = k; + X.data = d; + if (deltacode) + 
delta(0u, X.data.data(), X.data.size()); + + ostringstream convert; + convert << N; + X.name = convert.str(); + datas.push_back(X); + vector compressedbuffer; + compressedbuffer.resize(N*2); + vector recoverybuffer; + recoverybuffer.resize(N); + for(auto i : allcodecs) + sillyunittest(datas,compressedbuffer,recoverybuffer,*i); +} + + +void unit(bool deltacode, vector> & allcodecs,const uint32_t S ,int seed) { + const uint32_t N = 1U << S; + ClusteredDataGenerator cdg(seed); + + vector datas; + uint32_t NUMBER = 1;// Increase as needed + for(uint32_t gap = 1; gap + S <= 31; gap+=1) { + for(uint32_t T= 0;T compressedbuffer; + compressedbuffer.resize(N*2); + vector recoverybuffer; + recoverybuffer.resize(N); + for(auto i : allcodecs) + sillyunittest(datas,compressedbuffer,recoverybuffer,*i); +} + + +void tellmeaboutmachine() { + cout << "number of bytes in ostream::pos_type = " + << sizeof(ostream::pos_type) << endl; + cout << "number of bytes in size_t = " << sizeof(size_t) << endl; + cout << "number of bytes in int = " << sizeof(int) << endl; + cout << "number of bytes in long = " << sizeof(long) << endl; +#if __LITTLE_ENDIAN__ + cout << "you have little endian machine" << endl; +#endif +#if __BIG_ENDIAN__ + cout << "you have a big endian machine" << endl; +#endif +#if __CHAR_BIT__ + if (__CHAR_BIT__ != 8) + cout << "on your machine, chars don't have 8bits???" << endl; +#endif +#if __GNUG__ + cout << "GNU GCC compiler detected." << endl; +#else + cout << "Non-GCC compiler." 
<< endl; +#endif + +} +int main() { + vector> allcodecs = CODECFactory::allSchemes(); + + testSmall(allcodecs); + + + for(int k = 0; k<10;++k) { + cout<(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask2(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask3(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask5(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i 
InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask6(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask7(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); 
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
/// Packs 128 32-bit integers into 9 bits each using SSE2, four interleaved
/// lanes at a time (value i lives in lane i % 4).
/// "without mask": callers guarantee every input is < 2^9; larger values
/// would corrupt neighbouring packed fields.
/// Reads 32 unaligned __m128i (128 uint32_t) from _in and writes exactly
/// 9 unaligned __m128i to out.
void __uSIMD_fastpackwithoutmask9(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const int bit = 9;  // packed field width in bits
    // Restored template argument that was lost from the original text.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_loadu_si128(in);  // value 0 starts at bit offset 0
    int used = bit;                        // bits of OutReg already occupied
    for (int k = 1; k < 32; ++k) {
        const __m128i InReg = _mm_loadu_si128(++in);
        if (used == 0) {
            // Previous value ended exactly on a word boundary: start fresh.
            OutReg = InReg;
            used = bit;
            continue;
        }
        OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += bit;
        if (used >= 32) {                  // current output word is full
            _mm_storeu_si128(out++, OutReg);
            used -= 32;
            if (used != 0)                 // carry this value's high bits over
                OutReg = _mm_srli_epi32(InReg, bit - used);
        }
    }
    // 32 * bit bits fill exactly `bit` words, so the final store ran in-loop.
}
/// Packs 128 32-bit integers into 10 bits each using SSE2, four interleaved
/// lanes at a time (value i lives in lane i % 4).
/// "without mask": callers guarantee every input is < 2^10; larger values
/// would corrupt neighbouring packed fields.
/// Reads 32 unaligned __m128i (128 uint32_t) from _in and writes exactly
/// 10 unaligned __m128i to out. Since 16 * 10 = 160 is a multiple of 32, the
/// schedule restarts cleanly on a word boundary halfway through.
void __uSIMD_fastpackwithoutmask10(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const int bit = 10;  // packed field width in bits
    // Restored template argument that was lost from the original text.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_loadu_si128(in);  // value 0 starts at bit offset 0
    int used = bit;                        // bits of OutReg already occupied
    for (int k = 1; k < 32; ++k) {
        const __m128i InReg = _mm_loadu_si128(++in);
        if (used == 0) {
            // Previous value ended exactly on a word boundary: start fresh.
            OutReg = InReg;
            used = bit;
            continue;
        }
        OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += bit;
        if (used >= 32) {                  // current output word is full
            _mm_storeu_si128(out++, OutReg);
            used -= 32;
            if (used != 0)                 // carry this value's high bits over
                OutReg = _mm_srli_epi32(InReg, bit - used);
        }
    }
    // 32 * bit bits fill exactly `bit` words, so the final store ran in-loop.
}
/// Packs 128 32-bit integers into 11 bits each using SSE2, four interleaved
/// lanes at a time (value i lives in lane i % 4).
/// "without mask": callers guarantee every input is < 2^11; larger values
/// would corrupt neighbouring packed fields.
/// Reads 32 unaligned __m128i (128 uint32_t) from _in and writes exactly
/// 11 unaligned __m128i to out.
void __uSIMD_fastpackwithoutmask11(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const int bit = 11;  // packed field width in bits
    // Restored template argument that was lost from the original text.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_loadu_si128(in);  // value 0 starts at bit offset 0
    int used = bit;                        // bits of OutReg already occupied
    for (int k = 1; k < 32; ++k) {
        const __m128i InReg = _mm_loadu_si128(++in);
        if (used == 0) {
            // Previous value ended exactly on a word boundary: start fresh.
            OutReg = InReg;
            used = bit;
            continue;
        }
        OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += bit;
        if (used >= 32) {                  // current output word is full
            _mm_storeu_si128(out++, OutReg);
            used -= 32;
            if (used != 0)                 // carry this value's high bits over
                OutReg = _mm_srli_epi32(InReg, bit - used);
        }
    }
    // 32 * bit bits fill exactly `bit` words, so the final store ran in-loop.
}
/// Packs 128 32-bit integers into 12 bits each using SSE2, four interleaved
/// lanes at a time (value i lives in lane i % 4).
/// "without mask": callers guarantee every input is < 2^12; larger values
/// would corrupt neighbouring packed fields.
/// Reads 32 unaligned __m128i (128 uint32_t) from _in and writes exactly
/// 12 unaligned __m128i to out. Since 8 * 12 = 96 is a multiple of 32, the
/// schedule restarts cleanly on a word boundary every 8 values.
void __uSIMD_fastpackwithoutmask12(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const int bit = 12;  // packed field width in bits
    // Restored template argument that was lost from the original text.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_loadu_si128(in);  // value 0 starts at bit offset 0
    int used = bit;                        // bits of OutReg already occupied
    for (int k = 1; k < 32; ++k) {
        const __m128i InReg = _mm_loadu_si128(++in);
        if (used == 0) {
            // Previous value ended exactly on a word boundary: start fresh.
            OutReg = InReg;
            used = bit;
            continue;
        }
        OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += bit;
        if (used >= 32) {                  // current output word is full
            _mm_storeu_si128(out++, OutReg);
            used -= 32;
            if (used != 0)                 // carry this value's high bits over
                OutReg = _mm_srli_epi32(InReg, bit - used);
        }
    }
    // 32 * bit bits fill exactly `bit` words, so the final store ran in-loop.
}
/// Packs 128 32-bit integers into 13 bits each using SSE2, four interleaved
/// lanes at a time (value i lives in lane i % 4).
/// "without mask": callers guarantee every input is < 2^13; larger values
/// would corrupt neighbouring packed fields.
/// Reads 32 unaligned __m128i (128 uint32_t) from _in and writes exactly
/// 13 unaligned __m128i to out.
void __uSIMD_fastpackwithoutmask13(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const int bit = 13;  // packed field width in bits
    // Restored template argument that was lost from the original text.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_loadu_si128(in);  // value 0 starts at bit offset 0
    int used = bit;                        // bits of OutReg already occupied
    for (int k = 1; k < 32; ++k) {
        const __m128i InReg = _mm_loadu_si128(++in);
        if (used == 0) {
            // Previous value ended exactly on a word boundary: start fresh.
            OutReg = InReg;
            used = bit;
            continue;
        }
        OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += bit;
        if (used >= 32) {                  // current output word is full
            _mm_storeu_si128(out++, OutReg);
            used -= 32;
            if (used != 0)                 // carry this value's high bits over
                OutReg = _mm_srli_epi32(InReg, bit - used);
        }
    }
    // 32 * bit bits fill exactly `bit` words, so the final store ran in-loop.
}
/// Packs 128 32-bit integers into 14 bits each using SSE2, four interleaved
/// lanes at a time (value i lives in lane i % 4).
/// "without mask": callers guarantee every input is < 2^14; larger values
/// would corrupt neighbouring packed fields.
/// Reads 32 unaligned __m128i (128 uint32_t) from _in and writes exactly
/// 14 unaligned __m128i to out. Since 16 * 14 = 224 is a multiple of 32, the
/// schedule restarts cleanly on a word boundary halfway through.
void __uSIMD_fastpackwithoutmask14(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const int bit = 14;  // packed field width in bits
    // Restored template argument that was lost from the original text.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_loadu_si128(in);  // value 0 starts at bit offset 0
    int used = bit;                        // bits of OutReg already occupied
    for (int k = 1; k < 32; ++k) {
        const __m128i InReg = _mm_loadu_si128(++in);
        if (used == 0) {
            // Previous value ended exactly on a word boundary: start fresh.
            OutReg = InReg;
            used = bit;
            continue;
        }
        OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += bit;
        if (used >= 32) {                  // current output word is full
            _mm_storeu_si128(out++, OutReg);
            used -= 32;
            if (used != 0)                 // carry this value's high bits over
                OutReg = _mm_srli_epi32(InReg, bit - used);
        }
    }
    // 32 * bit bits fill exactly `bit` words, so the final store ran in-loop.
}
/// Packs 128 32-bit integers into 15 bits each using SSE2, four interleaved
/// lanes at a time (value i lives in lane i % 4).
/// "without mask": callers guarantee every input is < 2^15; larger values
/// would corrupt neighbouring packed fields.
/// Reads 32 unaligned __m128i (128 uint32_t) from _in and writes exactly
/// 15 unaligned __m128i to out.
void __uSIMD_fastpackwithoutmask15(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const int bit = 15;  // packed field width in bits
    // Restored template argument that was lost from the original text.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_loadu_si128(in);  // value 0 starts at bit offset 0
    int used = bit;                        // bits of OutReg already occupied
    for (int k = 1; k < 32; ++k) {
        const __m128i InReg = _mm_loadu_si128(++in);
        if (used == 0) {
            // Previous value ended exactly on a word boundary: start fresh.
            OutReg = InReg;
            used = bit;
            continue;
        }
        OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += bit;
        if (used >= 32) {                  // current output word is full
            _mm_storeu_si128(out++, OutReg);
            used -= 32;
            if (used != 0)                 // carry this value's high bits over
                OutReg = _mm_srli_epi32(InReg, bit - used);
        }
    }
    // 32 * bit bits fill exactly `bit` words, so the final store ran in-loop.
}
/// Packs 128 32-bit integers into 17 bits each using SSE2, four interleaved
/// lanes at a time (value i lives in lane i % 4).
/// "without mask": callers guarantee every input is < 2^17; larger values
/// would corrupt neighbouring packed fields.
/// Reads 32 unaligned __m128i (128 uint32_t) from _in and writes exactly
/// 17 unaligned __m128i to out. With bit > 16 every second value straddles a
/// word boundary, so the store-and-carry path runs on most iterations.
void __uSIMD_fastpackwithoutmask17(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    const int bit = 17;  // packed field width in bits
    // Restored template argument that was lost from the original text.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    __m128i OutReg = _mm_loadu_si128(in);  // value 0 starts at bit offset 0
    int used = bit;                        // bits of OutReg already occupied
    for (int k = 1; k < 32; ++k) {
        const __m128i InReg = _mm_loadu_si128(++in);
        if (used == 0) {
            // Previous value ended exactly on a word boundary: start fresh.
            OutReg = InReg;
            used = bit;
            continue;
        }
        OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, used));
        used += bit;
        if (used >= 32) {                  // current output word is full
            _mm_storeu_si128(out++, OutReg);
            used -= 32;
            if (used != 0)                 // carry this value's high bits over
                OutReg = _mm_srli_epi32(InReg, bit - used);
        }
    }
    // 32 * bit bits fill exactly `bit` words, so the final store ran in-loop.
}
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask18(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i 
OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); 
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask19(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask20(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask21(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask22(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask23(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg 
= _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask24(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask25(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask26(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask27(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 
3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask28(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask29(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void 
__uSIMD_fastpackwithoutmask30(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask31(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 
31 - 30); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_loadu_si128(++in); 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask32(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + 
InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + 
InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpackwithoutmask4(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; + + for(uint32_t outer=0; outer< 4 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_loadu_si128(in+4); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+5); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_loadu_si128(in+6); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_loadu_si128(in+7); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +void __uSIMD_fastpackwithoutmask8(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; + + for(uint32_t outer=0; outer< 8 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+2); + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +void __uSIMD_fastpackwithoutmask16(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + __m128i InReg; + + for(uint32_t outer=0; outer< 16 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + +void __uSIMD_fastpack1(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<1)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
+ + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack2(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); 
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack3(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack5(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack6(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack7(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = 
// NOTE(review): this hunk was recovered from a mangled patch: the '+' diff
// markers had been fused into the code and the template argument of every
// reinterpret_cast<const __m128i *>(...) had been stripped (the <...> was
// eaten by HTML-style extraction), leaving ill-formed `reinterpret_cast(_in)`.
// The functions below are the repaired, equivalent implementations.
//
// Data layout (unchanged from the original generated code):
//   * Each packer consumes 128 x 32-bit integers and emits exactly `b`
//     __m128i words (b = bit width).
//   * SSE lane j (j = 0..3) independently packs the 32 values _in[4*k + j]
//     (k = 0..31), least-significant bits first, b bits per value — i.e.
//     four interleaved scalar bit-packing streams.
//   * Inputs are masked to b bits, so wider values are silently truncated
//     (same contract as the original).
//
// The original code was fully unrolled with immediate shift counts; the
// rolled helper below is bit-for-bit equivalent: the word-store and
// carry-over points fall at exactly the same positions (verified against
// the surviving shift sequences for b = 7, 9..15, 17).

// Generic vertical b-bit packer shared by the fixed-width entry points.
// Packs 128 values from `_in` into `b` 128-bit words at `out`.
template <uint32_t b>
static void usimd_fastpack_impl(const uint32_t *__restrict__ _in,
                                __m128i *__restrict__ out) {
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    const __m128i mask = _mm_set1_epi32(static_cast<int>((1U << b) - 1));
    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bit offset of the next value inside the current word
    for (uint32_t i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
        ++in;
        OutReg = _mm_or_si128(OutReg,
                              _mm_slli_epi32(InReg, static_cast<int>(shift)));
        shift += b;
        if (shift >= 32) {  // the current 32-bit word is full in every lane
            _mm_storeu_si128(out, OutReg);
            ++out;
            shift -= 32;
            // Carry the high bits of the value straddling the word boundary;
            // when the value ended exactly on the boundary, start fresh.
            OutReg = shift ? _mm_srli_epi32(InReg, static_cast<int>(b - shift))
                           : _mm_setzero_si128();
        }
    }
    // 32 * b is a multiple of 32, so the loop always ends right after a store:
    // no trailing partial word remains.
}

// Pack 128 integers at 7 bits each into 7 __m128i words.
void __uSIMD_fastpack7(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
    usimd_fastpack_impl<7>(_in, out);
}

// Pack 128 integers at 9 bits each into 9 __m128i words.
void __uSIMD_fastpack9(const uint32_t *__restrict__ _in,
                       __m128i *__restrict__ out) {
    usimd_fastpack_impl<9>(_in, out);
}

// Pack 128 integers at 10 bits each into 10 __m128i words.
void __uSIMD_fastpack10(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
    usimd_fastpack_impl<10>(_in, out);
}

// Pack 128 integers at 11 bits each into 11 __m128i words.
void __uSIMD_fastpack11(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
    usimd_fastpack_impl<11>(_in, out);
}

// Pack 128 integers at 12 bits each into 12 __m128i words.
void __uSIMD_fastpack12(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
    usimd_fastpack_impl<12>(_in, out);
}

// Pack 128 integers at 13 bits each into 13 __m128i words.
void __uSIMD_fastpack13(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
    usimd_fastpack_impl<13>(_in, out);
}

// Pack 128 integers at 14 bits each into 14 __m128i words.
void __uSIMD_fastpack14(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
    usimd_fastpack_impl<14>(_in, out);
}

// Pack 128 integers at 15 bits each into 15 __m128i words.
void __uSIMD_fastpack15(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
    usimd_fastpack_impl<15>(_in, out);
}

// Pack 128 integers at 17 bits each into 17 __m128i words.
void __uSIMD_fastpack17(const uint32_t *__restrict__ _in,
                        __m128i *__restrict__ out) {
    usimd_fastpack_impl<17>(_in, out);
}
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack18(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack19(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + 
OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 
19 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack20(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg 
= _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack21(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack22(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const 
__m128i mask = _mm_set1_epi32((1U<<22)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, 
/// Bit-packs 128 uint32 values (4 interleaved lanes of 32 values each) into
/// 23-bit fields: reads 32 unaligned SSE vectors from _in and writes 23
/// vectors to out. Inputs are masked to their low 23 bits; output lane k
/// holds the contiguous little-endian bit stream of inputs _in[4*j + k].
void __uSIMD_fastpack23(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: the cast was written `reinterpret_cast(_in)` with no target type,
    // which is ill-formed; the destination type must be spelled out.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr uint32_t kBits = 23;
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bit offset of the next field within the current output word
    for (uint32_t i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_loadu_si128(in + i), mask);
        // At offset 0 the field starts a fresh word; otherwise OR it in place.
        OutReg = (shift == 0) ? InReg
                              : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, (int)shift));
        shift += kBits;
        if (shift >= 32) {  // the 32-bit word is full: flush it
            _mm_storeu_si128(out++, OutReg);
            shift -= 32;
            if (shift != 0)  // carry the field's high bits into the next word
                OutReg = _mm_srli_epi32(InReg, (int)(kBits - shift));
        }
    }
    // 32 * 23 bits is an exact multiple of 32, so the final word is always
    // flushed inside the loop; no trailing store is needed.
}
/// Bit-packs 128 uint32 values (4 interleaved lanes of 32 values each) into
/// 24-bit fields: reads 32 unaligned SSE vectors from _in and writes 24
/// vectors to out. Inputs are masked to their low 24 bits; output lane k
/// holds the contiguous little-endian bit stream of inputs _in[4*j + k].
void __uSIMD_fastpack24(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: the cast was written `reinterpret_cast(_in)` with no target type,
    // which is ill-formed; the destination type must be spelled out.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr uint32_t kBits = 24;
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bit offset of the next field within the current output word
    for (uint32_t i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_loadu_si128(in + i), mask);
        // At offset 0 the field starts a fresh word; otherwise OR it in place.
        OutReg = (shift == 0) ? InReg
                              : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, (int)shift));
        shift += kBits;
        if (shift >= 32) {  // the 32-bit word is full: flush it
            _mm_storeu_si128(out++, OutReg);
            shift -= 32;
            if (shift != 0)  // carry the field's high bits into the next word
                OutReg = _mm_srli_epi32(InReg, (int)(kBits - shift));
            // When shift lands exactly on 0 (every 4 inputs for 24-bit fields),
            // the next input starts a fresh word with no carried bits.
        }
    }
}
/// Bit-packs 128 uint32 values (4 interleaved lanes of 32 values each) into
/// 25-bit fields: reads 32 unaligned SSE vectors from _in and writes 25
/// vectors to out. Inputs are masked to their low 25 bits; output lane k
/// holds the contiguous little-endian bit stream of inputs _in[4*j + k].
void __uSIMD_fastpack25(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: the cast was written `reinterpret_cast(_in)` with no target type,
    // which is ill-formed; the destination type must be spelled out.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr uint32_t kBits = 25;
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bit offset of the next field within the current output word
    for (uint32_t i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_loadu_si128(in + i), mask);
        // At offset 0 the field starts a fresh word; otherwise OR it in place.
        OutReg = (shift == 0) ? InReg
                              : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, (int)shift));
        shift += kBits;
        if (shift >= 32) {  // the 32-bit word is full: flush it
            _mm_storeu_si128(out++, OutReg);
            shift -= 32;
            if (shift != 0)  // carry the field's high bits into the next word
                OutReg = _mm_srli_epi32(InReg, (int)(kBits - shift));
        }
    }
    // 32 * 25 bits is an exact multiple of 32, so the final word is always
    // flushed inside the loop; no trailing store is needed.
}
/// Bit-packs 128 uint32 values (4 interleaved lanes of 32 values each) into
/// 26-bit fields: reads 32 unaligned SSE vectors from _in and writes 26
/// vectors to out. Inputs are masked to their low 26 bits; output lane k
/// holds the contiguous little-endian bit stream of inputs _in[4*j + k].
void __uSIMD_fastpack26(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: the cast was written `reinterpret_cast(_in)` with no target type,
    // which is ill-formed; the destination type must be spelled out.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr uint32_t kBits = 26;
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bit offset of the next field within the current output word
    for (uint32_t i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_loadu_si128(in + i), mask);
        // At offset 0 the field starts a fresh word; otherwise OR it in place.
        OutReg = (shift == 0) ? InReg
                              : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, (int)shift));
        shift += kBits;
        if (shift >= 32) {  // the 32-bit word is full: flush it
            _mm_storeu_si128(out++, OutReg);
            shift -= 32;
            if (shift != 0)  // carry the field's high bits into the next word
                OutReg = _mm_srli_epi32(InReg, (int)(kBits - shift));
            // When shift lands exactly on 0 (mid-block for 26-bit fields),
            // the next input starts a fresh word with no carried bits.
        }
    }
}
/// Bit-packs 128 uint32 values (4 interleaved lanes of 32 values each) into
/// 27-bit fields: reads 32 unaligned SSE vectors from _in and writes 27
/// vectors to out. Inputs are masked to their low 27 bits; output lane k
/// holds the contiguous little-endian bit stream of inputs _in[4*j + k].
void __uSIMD_fastpack27(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: the cast was written `reinterpret_cast(_in)` with no target type,
    // which is ill-formed; the destination type must be spelled out.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr uint32_t kBits = 27;
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bit offset of the next field within the current output word
    for (uint32_t i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_loadu_si128(in + i), mask);
        // At offset 0 the field starts a fresh word; otherwise OR it in place.
        OutReg = (shift == 0) ? InReg
                              : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, (int)shift));
        shift += kBits;
        if (shift >= 32) {  // the 32-bit word is full: flush it
            _mm_storeu_si128(out++, OutReg);
            shift -= 32;
            if (shift != 0)  // carry the field's high bits into the next word
                OutReg = _mm_srli_epi32(InReg, (int)(kBits - shift));
        }
    }
    // 32 * 27 bits is an exact multiple of 32, so the final word is always
    // flushed inside the loop; no trailing store is needed.
}
/// Bit-packs 128 uint32 values (4 interleaved lanes of 32 values each) into
/// 28-bit fields: reads 32 unaligned SSE vectors from _in and writes 28
/// vectors to out. Inputs are masked to their low 28 bits; output lane k
/// holds the contiguous little-endian bit stream of inputs _in[4*j + k].
void __uSIMD_fastpack28(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) {
    // Fix: the cast was written `reinterpret_cast(_in)` with no target type,
    // which is ill-formed; the destination type must be spelled out.
    const __m128i *in = reinterpret_cast<const __m128i *>(_in);
    constexpr uint32_t kBits = 28;
    const __m128i mask = _mm_set1_epi32((1U << kBits) - 1);

    __m128i OutReg = _mm_setzero_si128();
    uint32_t shift = 0;  // bit offset of the next field within the current output word
    for (uint32_t i = 0; i < 32; ++i) {
        const __m128i InReg = _mm_and_si128(_mm_loadu_si128(in + i), mask);
        // At offset 0 the field starts a fresh word; otherwise OR it in place.
        OutReg = (shift == 0) ? InReg
                              : _mm_or_si128(OutReg, _mm_slli_epi32(InReg, (int)shift));
        shift += kBits;
        if (shift >= 32) {  // the 32-bit word is full: flush it
            _mm_storeu_si128(out++, OutReg);
            shift -= 32;
            if (shift != 0)  // carry the field's high bits into the next word
                OutReg = _mm_srli_epi32(InReg, (int)(kBits - shift));
            // When shift lands exactly on 0 (every 8 inputs for 28-bit fields),
            // the next input starts a fresh word with no carried bits.
        }
    }
}
OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack30(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack31(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack32(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg; + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + 
_mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + 
_mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +void __uSIMD_fastpack4(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + + for(uint32_t outer=0; outer< 4 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } 
+ +} + + + +void __uSIMD_fastpack8(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + + for(uint32_t outer=0; outer< 8 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +void __uSIMD_fastpack16(const uint32_t * __restrict__ _in, __m128i * __restrict__ out) { + const __m128i *in = reinterpret_cast(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + + for(uint32_t outer=0; outer< 16 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + + +void __uSIMD_fastunpack1(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg1 = _mm_loadu_si128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + unsigned shift = 0; + + for (unsigned i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + _mm_storeu_si128(out++, OutReg1); + _mm_storeu_si128(out++, OutReg2); + _mm_storeu_si128(out++, 
OutReg3); + _mm_storeu_si128(out++, OutReg4); + } +} + + + + +void __uSIMD_fastunpack2(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack3(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + 
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack4(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + 
OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack5(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + 
OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack6(const __m128i* __restrict__ in, uint32_t * 
__restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + 
OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack7(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack8(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack9(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); 
+ _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) 
; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack10(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack11(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack12(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); 
+ _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack13(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack14(const __m128i* __restrict__ in, uint32_t * __restrict__ 
_out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack15(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); 
+ + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack16(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack17(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack18(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); 
+ _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack19(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), 
mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack20(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; 
// Decodes 128 values packed at 21 bits each into full 32-bit integers,
// four values per SSE register (32 output registers in total).
//
// Layout: each 32-bit lane of the __m128i input words is an independent
// little-endian bitstream, so lane j of the k-th output register holds
// value number 4*k + j.
//
// Rolled rewrite of the generated fully-unrolled kernel: it performs the
// same sequence of loads, shifts, masks and stores, driven by a running
// bit offset instead of hard-coded shift constants.
void __uSIMD_fastunpack21(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) {

    enum { BIT = 21 };                                  // packed width per value
    __m128i* out = reinterpret_cast<__m128i*>(_out);
    __m128i cur = _mm_loadu_si128(in);                  // current input word
    const __m128i mask = _mm_set1_epi32((1U << BIT) - 1);
    unsigned off = 0;                                   // bit offset inside `cur`

    for (int k = 0; k < 32; ++k) {
        __m128i res;
        if (off + BIT <= 32) {
            // Value is fully contained in the current word.
            res = _mm_and_si128(_mm_srli_epi32(cur, off), mask);
            off += BIT;
            if (off == 32 && k != 31) {                 // word exhausted: advance
                cur = _mm_loadu_si128(++in);
                off = 0;
            }
        } else {
            // Value straddles a word boundary: high bits of `cur` joined
            // with the low bits of the next input word.
            res = _mm_srli_epi32(cur, off);
            cur = _mm_loadu_si128(++in);
            res = _mm_or_si128(res, _mm_and_si128(_mm_slli_epi32(cur, 32 - off), mask));
            off = off + BIT - 32;
        }
        _mm_storeu_si128(out++, res);
    }
}
// Decodes 128 values packed at 22 bits each into full 32-bit integers,
// four values per SSE register (32 output registers in total).
//
// Each 32-bit lane of the input registers is an independent little-endian
// bitstream: lane j of output register k carries value 4*k + j.
//
// Rolled rewrite of the generated fully-unrolled kernel — identical loads,
// shifts, masks and stores, expressed as a loop with a running bit offset.
void __uSIMD_fastunpack22(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) {

    enum { BIT = 22 };                                  // packed width per value
    __m128i* out = reinterpret_cast<__m128i*>(_out);
    __m128i cur = _mm_loadu_si128(in);                  // current input word
    const __m128i mask = _mm_set1_epi32((1U << BIT) - 1);
    unsigned off = 0;                                   // bit offset inside `cur`

    for (int k = 0; k < 32; ++k) {
        __m128i res;
        if (off + BIT <= 32) {
            // Value lies entirely inside the current word.
            res = _mm_and_si128(_mm_srli_epi32(cur, off), mask);
            off += BIT;
            if (off == 32 && k != 31) {                 // word exhausted: advance
                cur = _mm_loadu_si128(++in);
                off = 0;
            }
        } else {
            // Value straddles two words: merge the high bits of `cur`
            // with the low bits of the next input word.
            res = _mm_srli_epi32(cur, off);
            cur = _mm_loadu_si128(++in);
            res = _mm_or_si128(res, _mm_and_si128(_mm_slli_epi32(cur, 32 - off), mask));
            off = off + BIT - 32;
        }
        _mm_storeu_si128(out++, res);
    }
}
// Decodes 128 values packed at 23 bits each into full 32-bit integers,
// four values per SSE register (32 output registers in total).
//
// Each 32-bit lane of the input registers is an independent little-endian
// bitstream: lane j of output register k carries value 4*k + j.
//
// Rolled rewrite of the generated fully-unrolled kernel — identical loads,
// shifts, masks and stores, expressed as a loop with a running bit offset.
void __uSIMD_fastunpack23(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) {

    enum { BIT = 23 };                                  // packed width per value
    __m128i* out = reinterpret_cast<__m128i*>(_out);
    __m128i cur = _mm_loadu_si128(in);                  // current input word
    const __m128i mask = _mm_set1_epi32((1U << BIT) - 1);
    unsigned off = 0;                                   // bit offset inside `cur`

    for (int k = 0; k < 32; ++k) {
        __m128i res;
        if (off + BIT <= 32) {
            // Value lies entirely inside the current word.
            res = _mm_and_si128(_mm_srli_epi32(cur, off), mask);
            off += BIT;
            if (off == 32 && k != 31) {                 // word exhausted: advance
                cur = _mm_loadu_si128(++in);
                off = 0;
            }
        } else {
            // Value straddles two words: merge the high bits of `cur`
            // with the low bits of the next input word.
            res = _mm_srli_epi32(cur, off);
            cur = _mm_loadu_si128(++in);
            res = _mm_or_si128(res, _mm_and_si128(_mm_slli_epi32(cur, 32 - off), mask));
            off = off + BIT - 32;
        }
        _mm_storeu_si128(out++, res);
    }
}
// Decodes 128 values packed at 24 bits each into full 32-bit integers,
// four values per SSE register (32 output registers in total).
//
// Each 32-bit lane of the input registers is an independent little-endian
// bitstream: lane j of output register k carries value 4*k + j. Because
// 24 divides into 32-bit words on byte boundaries, no value ever straddles
// a word; the straddle branch below is kept for uniformity but never taken.
//
// Rolled rewrite of the generated fully-unrolled kernel — identical loads,
// shifts, masks and stores, expressed as a loop with a running bit offset.
void __uSIMD_fastunpack24(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) {

    enum { BIT = 24 };                                  // packed width per value
    __m128i* out = reinterpret_cast<__m128i*>(_out);
    __m128i cur = _mm_loadu_si128(in);                  // current input word
    const __m128i mask = _mm_set1_epi32((1U << BIT) - 1);
    unsigned off = 0;                                   // bit offset inside `cur`

    for (int k = 0; k < 32; ++k) {
        __m128i res;
        if (off + BIT <= 32) {
            // Value lies entirely inside the current word.
            res = _mm_and_si128(_mm_srli_epi32(cur, off), mask);
            off += BIT;
            if (off == 32 && k != 31) {                 // word exhausted: advance
                cur = _mm_loadu_si128(++in);
                off = 0;
            }
        } else {
            // Unreachable for BIT == 24; retained for structural symmetry
            // with the other unpack widths.
            res = _mm_srli_epi32(cur, off);
            cur = _mm_loadu_si128(++in);
            res = _mm_or_si128(res, _mm_and_si128(_mm_slli_epi32(cur, 32 - off), mask));
            off = off + BIT - 32;
        }
        _mm_storeu_si128(out++, res);
    }
}
// Decodes 128 values packed at 25 bits each into full 32-bit integers,
// four values per SSE register (32 output registers in total).
//
// Each 32-bit lane of the input registers is an independent little-endian
// bitstream: lane j of output register k carries value 4*k + j.
//
// Rolled rewrite of the generated fully-unrolled kernel — identical loads,
// shifts, masks and stores, expressed as a loop with a running bit offset.
void __uSIMD_fastunpack25(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) {

    enum { BIT = 25 };                                  // packed width per value
    __m128i* out = reinterpret_cast<__m128i*>(_out);
    __m128i cur = _mm_loadu_si128(in);                  // current input word
    const __m128i mask = _mm_set1_epi32((1U << BIT) - 1);
    unsigned off = 0;                                   // bit offset inside `cur`

    for (int k = 0; k < 32; ++k) {
        __m128i res;
        if (off + BIT <= 32) {
            // Value lies entirely inside the current word.
            res = _mm_and_si128(_mm_srli_epi32(cur, off), mask);
            off += BIT;
            if (off == 32 && k != 31) {                 // word exhausted: advance
                cur = _mm_loadu_si128(++in);
                off = 0;
            }
        } else {
            // Value straddles two words: merge the high bits of `cur`
            // with the low bits of the next input word.
            res = _mm_srli_epi32(cur, off);
            cur = _mm_loadu_si128(++in);
            res = _mm_or_si128(res, _mm_and_si128(_mm_slli_epi32(cur, 32 - off), mask));
            off = off + BIT - 32;
        }
        _mm_storeu_si128(out++, res);
    }
}
// Decodes 128 values packed at 26 bits each into full 32-bit integers,
// four values per SSE register (32 output registers in total).
//
// Each 32-bit lane of the input registers is an independent little-endian
// bitstream: lane j of output register k carries value 4*k + j.
//
// Rolled rewrite of the generated fully-unrolled kernel — identical loads,
// shifts, masks and stores, expressed as a loop with a running bit offset.
void __uSIMD_fastunpack26(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) {

    enum { BIT = 26 };                                  // packed width per value
    __m128i* out = reinterpret_cast<__m128i*>(_out);
    __m128i cur = _mm_loadu_si128(in);                  // current input word
    const __m128i mask = _mm_set1_epi32((1U << BIT) - 1);
    unsigned off = 0;                                   // bit offset inside `cur`

    for (int k = 0; k < 32; ++k) {
        __m128i res;
        if (off + BIT <= 32) {
            // Value lies entirely inside the current word.
            res = _mm_and_si128(_mm_srli_epi32(cur, off), mask);
            off += BIT;
            if (off == 32 && k != 31) {                 // word exhausted: advance
                cur = _mm_loadu_si128(++in);
                off = 0;
            }
        } else {
            // Value straddles two words: merge the high bits of `cur`
            // with the low bits of the next input word.
            res = _mm_srli_epi32(cur, off);
            cur = _mm_loadu_si128(++in);
            res = _mm_or_si128(res, _mm_and_si128(_mm_slli_epi32(cur, 32 - off), mask));
            off = off + BIT - 32;
        }
        _mm_storeu_si128(out++, res);
    }
}
_mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack27(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
27-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack28(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + 
OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); 
+ + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack29(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = 
_mm_set1_epi32((1U<<29)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack30(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); 
+ __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack31(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + + __m128i* out = 
reinterpret_cast<__m128i*>(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); 
+ + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
31-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,1) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +void __uSIMD_fastunpack32(const __m128i* __restrict__ in, uint32_t * __restrict__ _out) { + __m128i* out = reinterpret_cast<__m128i*>(_out); + for(uint32_t outer=0; outer< 32 ;++outer) { + _mm_storeu_si128(out++, _mm_loadu_si128(in++)); + } +}