From 590f3f0ec6d4b1b2b54859577b221ba8cc11e291 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sun, 26 Feb 2017 22:03:58 +0100 Subject: [PATCH 01/37] Add AVX2 popcount algorithm --- CMakeLists.txt | 52 +++++-- include/popcnt.hpp | 92 +----------- src/BitSieve-popcnt.cpp | 309 ++++++++++++++++++++++++++++++++++++++++ src/BitSieve.cpp | 92 ++++-------- 4 files changed, 383 insertions(+), 162 deletions(-) create mode 100644 src/BitSieve-popcnt.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index af33ec780..fb2987231 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,13 +8,12 @@ set(CMAKE_BUILD_TYPE Release) # Build options ###################################################### -option(ENABLE_POPCNT "Enable POPCNT instruction" ON) option(ENABLE_MPI "Enable MPI (Message Passing Interface)" OFF) # Include Check* ##################################################### -include(CheckCXXCompilerFlag) include(CheckCXXSourceRuns) +include(CheckCXXSourceCompiles) include(CheckTypeSize) # primecount binary source files ##################################### @@ -26,6 +25,7 @@ set(BIN_SRC src/app/cmdoptions.cpp # primecount library source files #################################### set(LIB_SRC src/BitSieve.cpp + src/BitSieve-popcnt.cpp src/FactorTable.cpp src/Li.cpp src/P2.cpp @@ -107,15 +107,51 @@ if (OPENMP_FOUND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() -# Check for -mpopcnt compiler option ################################# +# Reset CMAKE_CXX_FLAGS for compiler checks ########################## -if(ENABLE_POPCNT) - CHECK_CXX_COMPILER_FLAG(-mpopcnt MPOPCNT_FLAG) - if(MPOPCNT_FLAG) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") - endif() +set(CXX_FLAGS "${CMAKE_CXX_FLAGS}") +set(CMAKE_CXX_FLAGS "") + +# Check -mpopcnt compiler flag ####################################### + +set(CMAKE_CXX_FLAGS "-mpopcnt -Werror") + +check_cxx_source_compiles( + "int main(int, char**) { return 0; }" + mpopcnt) + +if(mpopcnt) + set(CXX_FLAGS "${CXX_FLAGS} -mpopcnt") endif() +# Check -mpopcntd compiler flag ###################################### + +set(CMAKE_CXX_FLAGS "-mpopcntd -Werror") + +check_cxx_source_compiles( + "int main(int, char**) { return 0; }" + mpopcntd) + +if(mpopcntd) + set(CXX_FLAGS "${CXX_FLAGS} -mpopcntd") +endif() + +# Check -mavx2 compiler flag ######################################### + +set(CMAKE_CXX_FLAGS "-mavx2 -Werror") + +check_cxx_source_compiles( + "int main(int, char**) { return 0; }" + mavx2) + +if(mavx2) + set_source_files_properties(src/BitSieve-popcnt.cpp PROPERTIES COMPILE_FLAGS -mavx2) +endif() + +# Restore CMAKE_CXX_FLAGS ############################################ + +set(CMAKE_CXX_FLAGS "${CXX_FLAGS}") + # Check if int128_t type exists ###################################### check_type_size("int128_t" INT128_T) diff --git a/include/popcnt.hpp b/include/popcnt.hpp index c95e1d3bc..af730787c 100644 --- a/include/popcnt.hpp +++ b/include/popcnt.hpp @@ -1,7 +1,7 @@ /// -/// @file popcnt.hpp -/// @brief Functions to count the number of 1 bits inside a 64-bit -/// word or a 64-bit array. +/// @file popcnt.hpp +/// @brief Functions to count the number of 1 bits +/// inside a 64-bit word. /// /// Copyright (C) 2016 Kim Walisch, /// @@ -59,9 +59,6 @@ inline uint64_t popcnt64(uint64_t x) #else -/// Fallback mode if POPCNT intrinsic is not available -#define NO_POPCNT_INTRINSIC - inline uint64_t popcnt64(uint64_t x) { const uint64_t m1 = 0x5555555555555555ll; @@ -76,89 +73,6 @@ inline uint64_t popcnt64(uint64_t x) return (x * h01) >> 56; } -inline void CSA(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c) -{ - uint64_t u = a ^ b; - h = (a & b) | (u & c); - l = u ^ c; -} - -/// Harley-Seal popcount (4th iteration). -/// The Harley-Seal popcount algorithm is one of the fastest algorithms -/// for counting 1 bits in an array using only integer operations. -/// This implementation uses only 5.69 instructions per 64-bit word. -/// @see Chapter 5 in "Hacker's Delight" 2nd edition. -/// -inline uint64_t popcnt64(const uint64_t* data, uint64_t size) -{ - uint64_t total = 0; - uint64_t ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0; - uint64_t twosA, twosB, foursA, foursB, eightsA, eightsB; - uint64_t limit = size - size % 16; - uint64_t i = 0; - - for(; i < limit; i += 16) - { - CSA(twosA, ones, ones, data[i+0], data[i+1]); - CSA(twosB, ones, ones, data[i+2], data[i+3]); - CSA(foursA, twos, twos, twosA, twosB); - CSA(twosA, ones, ones, data[i+4], data[i+5]); - CSA(twosB, ones, ones, data[i+6], data[i+7]); - CSA(foursB, twos, twos, twosA, twosB); - CSA(eightsA,fours, fours, foursA, foursB); - CSA(twosA, ones, ones, data[i+8], data[i+9]); - CSA(twosB, ones, ones, data[i+10], data[i+11]); - CSA(foursA, twos, twos, twosA, twosB); - CSA(twosA, ones, ones, data[i+12], data[i+13]); - CSA(twosB, ones, ones, data[i+14], data[i+15]); - CSA(foursB, twos, twos, twosA, twosB); - CSA(eightsB, fours, fours, foursA, foursB); - CSA(sixteens, eights, eights, eightsA, eightsB); - - total += popcnt64(sixteens); - } - - total *= 16; - total += 8 * popcnt64(eights); - total += 4 * popcnt64(fours); - total += 2 * popcnt64(twos); - total += 1 * popcnt64(ones); - - for(; i < size; i++) - total += popcnt64(data[i]); - - return total; -} - -#endif - -#if !defined(NO_POPCNT_INTRINSIC) - -/// Count the number of 1 bits inside the data array -/// using the POPCNT instruction. -/// -inline uint64_t popcnt64(const uint64_t* data, uint64_t size) -{ - uint64_t sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0; - uint64_t limit = size - size % 4; - uint64_t i = 0; - - for (; i < limit; i += 4) - { - sum0 += popcnt64(data[i+0]); - sum1 += popcnt64(data[i+1]); - sum2 += popcnt64(data[i+2]); - sum3 += popcnt64(data[i+3]); - } - - uint64_t total = sum0 + sum1 + sum2 + sum3; - - for (; i < size; i++) - total += popcnt64(data[i]); - - return total; -} - #endif #endif /* POPCNT_HPP */ diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp new file mode 100644 index 000000000..2f76cd242 --- /dev/null +++ b/src/BitSieve-popcnt.cpp @@ -0,0 +1,309 @@ +/// +/// @file BitSieve-popcnt.cpp +/// @brief Count the number of 1 bits inside a 64-bit array. +/// The vectorized popcount algorithms used in this file are +/// described in the paper "Faster Population Counts using AVX2 +/// Instructions" by Wojciech Muła, Nathan Kurz, Daniel Lemire. +/// @see https://arxiv.org/abs/1611.07612 +/// @see https://github.com/WojciechMula/sse-popcount +/// +/// Copyright (C) 2016 Kim Walisch, +/// +/// This file is distributed under the BSD License. See the COPYING +/// file in the top level directory. +/// + +#include +#include + +#include +#include +#include + +namespace { +namespace DEFAULT { + +void CSA(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t u = a ^ b; + h = (a & b) | (u & c); + l = u ^ c; +} + +/// Harley-Seal popcount (4th iteration). +/// The Harley-Seal popcount algorithm is one of the fastest algorithms +/// for counting 1 bits in an array using only integer operations. +/// This implementation uses only 5.69 instructions per 64-bit word. +/// @see Chapter 5 in "Hacker's Delight" 2nd edition. +/// +uint64_t popcnt(const uint64_t* data, uint64_t size) +{ + uint64_t total = 0; + uint64_t ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0; + uint64_t twosA, twosB, foursA, foursB, eightsA, eightsB; + uint64_t limit = size - size % 16; + uint64_t i = 0; + + for(; i < limit; i += 16) + { + CSA(twosA, ones, ones, data[i+0], data[i+1]); + CSA(twosB, ones, ones, data[i+2], data[i+3]); + CSA(foursA, twos, twos, twosA, twosB); + CSA(twosA, ones, ones, data[i+4], data[i+5]); + CSA(twosB, ones, ones, data[i+6], data[i+7]); + CSA(foursB, twos, twos, twosA, twosB); + CSA(eightsA,fours, fours, foursA, foursB); + CSA(twosA, ones, ones, data[i+8], data[i+9]); + CSA(twosB, ones, ones, data[i+10], data[i+11]); + CSA(foursA, twos, twos, twosA, twosB); + CSA(twosA, ones, ones, data[i+12], data[i+13]); + CSA(twosB, ones, ones, data[i+14], data[i+15]); + CSA(foursB, twos, twos, twosA, twosB); + CSA(eightsB, fours, fours, foursA, foursB); + CSA(sixteens, eights, eights, eightsA, eightsB); + + total += popcnt64(sixteens); + } + + total *= 16; + total += 8 * popcnt64(eights); + total += 4 * popcnt64(fours); + total += 2 * popcnt64(twos); + total += 1 * popcnt64(ones); + + for(; i < size; i++) + total += popcnt64(data[i]); + + return total; +} + +} // namespace DEFAULT +} // namespace + +namespace { +namespace POPCNT { + +/// Count the number of 1 bits inside the data +/// array using the POPCNT instruction. +/// +uint64_t popcnt(const uint64_t* data, uint64_t size) +{ + uint64_t sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0; + uint64_t limit = size - size % 4; + uint64_t i = 0; + + for (; i < limit; i += 4) + { + sum0 += popcnt64(data[i+0]); + sum1 += popcnt64(data[i+1]); + sum2 += popcnt64(data[i+2]); + sum3 += popcnt64(data[i+3]); + } + + uint64_t total = sum0 + sum1 + sum2 + sum3; + + for (; i < size; i++) + total += popcnt64(data[i]); + + return total; +} + +} // namespace POPCNT +} // namespace + +#if defined(__x86_64__) && \ + defined(__GNUC__) && \ + (__GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) + +#include + +namespace { +namespace AVX2 { + +__m256i popcnt(const __m256i v) +{ + __m256i m1 = _mm256_set1_epi8(0x55); + __m256i m2 = _mm256_set1_epi8(0x33); + __m256i m4 = _mm256_set1_epi8(0x0F); + + __m256i t1 = _mm256_sub_epi8(v, (_mm256_srli_epi16(v, 1) & m1)); + __m256i t2 = _mm256_add_epi8(t1 & m2, (_mm256_srli_epi16(t1, 2) & m2)); + __m256i t3 = _mm256_add_epi8(t2, _mm256_srli_epi16(t2, 4)) & m4; + + return _mm256_sad_epu8(t3, _mm256_setzero_si256()); +} + +void CSA(__m256i& h, __m256i& l, __m256i a, __m256i b, __m256i c) +{ + __m256i u = a ^ b; + h = (a & b) | (u & c); + l = u ^ c; +} + +/// AVX2 Harley-Seal popcount (4th iteration). +/// The algorithm is based on the paper "Faster Population Counts +/// using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and +/// Wojciech Mula (23 Nov 2016). +/// @see https://arxiv.org/abs/1611.07612 +/// +uint64_t popcnt(const __m256i* data, uint64_t size) +{ + __m256i total = _mm256_setzero_si256(); + __m256i ones = _mm256_setzero_si256(); + __m256i twos = _mm256_setzero_si256(); + __m256i fours = _mm256_setzero_si256(); + __m256i eights = _mm256_setzero_si256(); + __m256i sixteens = _mm256_setzero_si256(); + __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t limit = size - size % 16; + uint64_t i = 0; + + for(; i < limit; i += 16) + { + CSA(twosA, ones, ones, data[i+0], data[i+1]); + CSA(twosB, ones, ones, data[i+2], data[i+3]); + CSA(foursA, twos, twos, twosA, twosB); + CSA(twosA, ones, ones, data[i+4], data[i+5]); + CSA(twosB, ones, ones, data[i+6], data[i+7]); + CSA(foursB, twos, twos, twosA, twosB); + CSA(eightsA,fours, fours, foursA, foursB); + CSA(twosA, ones, ones, data[i+8], data[i+9]); + CSA(twosB, ones, ones, data[i+10], data[i+11]); + CSA(foursA, twos, twos, twosA, twosB); + CSA(twosA, ones, ones, data[i+12], data[i+13]); + CSA(twosB, ones, ones, data[i+14], data[i+15]); + CSA(foursB, twos, twos, twosA, twosB); + CSA(eightsB, fours, fours, foursA, foursB); + CSA(sixteens, eights, eights, eightsA, eightsB); + + total = _mm256_add_epi64(total, popcnt(sixteens)); + } + + total = _mm256_slli_epi64(total, 4); + total = _mm256_add_epi64(total, _mm256_slli_epi64(popcnt(eights), 3)); + total = _mm256_add_epi64(total, _mm256_slli_epi64(popcnt(fours), 2)); + total = _mm256_add_epi64(total, _mm256_slli_epi64(popcnt(twos), 1)); + total = _mm256_add_epi64(total, popcnt(ones)); + + for(; i < size; i++) + total = _mm256_add_epi64(total, popcnt(data[i])); + + uint64_t* total64 = (uint64_t*) &total; + + return total64[0] + + total64[1] + + total64[2] + + total64[3]; +} + +/// Align memory to 32 bytes boundary +void align(const uint64_t*& data, uint64_t* size, uint64_t* total) +{ + for (; *size > 0 && (uintptr_t) data % 32 != 0; data++) + { + *total += popcnt64(*data); + *size -= 1; + } +} + +/// AVX2 popcount algorithm for 64-bit arrays. +/// @param data A 64-bit array +/// @param size Length of data array +/// +uint64_t popcnt(const uint64_t* data, uint64_t size) +{ + uint64_t total = 0; + + // AVX2 popcount is faster than POPCNT + // for array sizes >= 1 kilobyte + if (size * 8 >= 1024) + { + align(data, &size, &total); + total += popcnt((const __m256i*) data, size / 4); + data += size - size % 4; + size = size % 4; + } + + // process remaining words + total += POPCNT::popcnt(data, size); + + return total; +} + +} // namespace AVX2 +} // namespace + +#endif /* AVX2 */ + +/// Function multi-versioning is currently (February 2017) +/// only supported by GCC >= 4.8 +/// +#if defined(__x86_64__) && \ + defined(__GNUC__) && \ + (__GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) + +namespace { + +__attribute__ ((target ("default"))) +uint64_t popcnt(const uint64_t* data, uint64_t size) +{ + return DEFAULT::popcnt(data, size); +} + +__attribute__ ((target ("popcnt"))) +uint64_t popcnt(const uint64_t* data, uint64_t size) +{ + return POPCNT::popcnt(data, size); +} + +__attribute__ ((target ("avx2"))) +uint64_t popcnt(const uint64_t* data, uint64_t size) +{ + return AVX2::popcnt(data, size); +} + +} // namespace + +#else + +uint64_t popcnt(const uint64_t* data, uint64_t size) +{ + return POPCNT::popcnt(data, size); +} + +#endif + +namespace primecount { + +/// Count the number of 1 bits inside [start, stop] +uint64_t BitSieve::count(uint64_t start, + uint64_t stop) const +{ + if (start > stop) + return 0; + + assert(stop < size_); + + uint64_t start_idx = start / 64; + uint64_t stop_idx = stop / 64; + uint64_t m1 = 0xffffffffffffffffull << (start % 64); + uint64_t m2 = 0xffffffffffffffffull >> (63 - stop % 64); + uint64_t bit_count; + + if (start_idx == stop_idx) + bit_count = popcnt64(sieve_[start_idx] & (m1 & m2)); + else + { + bit_count = popcnt64(sieve_[start_idx] & m1); + bit_count += popcnt(&sieve_[start_idx + 1], stop_idx - (start_idx + 1)); + bit_count += popcnt64(sieve_[stop_idx] & m2); + } + + return bit_count; +} + +} // namespace + diff --git a/src/BitSieve.cpp b/src/BitSieve.cpp index 2f4874d3f..cd664f6e2 100644 --- a/src/BitSieve.cpp +++ b/src/BitSieve.cpp @@ -11,12 +11,7 @@ /// file in the top level directory. /// -#if !defined(__STDC_CONSTANT_MACROS) - #define __STDC_CONSTANT_MACROS -#endif - #include -#include #include #include @@ -35,16 +30,16 @@ const uint64_t primes[] = { 0, 2, 3, 5, 7, 11, 13, 17, 19, 23 }; /// Bitmasks with multiples of the i-th prime set const uint64_t masks[] = { - UINT64_C(0x0000000000000000), - UINT64_C(0x5555555555555555), // 2 - UINT64_C(0x9249249249249249), // 3 - UINT64_C(0x1084210842108421), // 5 - UINT64_C(0x8102040810204081), // 7 - UINT64_C(0x0080100200400801), // 11 - UINT64_C(0x0010008004002001), // 13 - UINT64_C(0x0008000400020001), // 17 - UINT64_C(0x0200004000080001), // 19 - UINT64_C(0x0000400000800001) // 23 + 0x0000000000000000ull, + 0x5555555555555555ull, // 2 + 0x9249249249249249ull, // 3 + 0x1084210842108421ull, // 5 + 0x8102040810204081ull, // 7 + 0x0080100200400801ull, // 11 + 0x0010008004002001ull, // 13 + 0x0008000400020001ull, // 17 + 0x0200004000080001ull, // 19 + 0x0000400000800001ull // 23 }; /// Get bitmask with unset multiples @@ -61,34 +56,28 @@ uint64_t fast_modulo(uint64_t x, uint64_t y) return x; } -} +} // namespace namespace primecount { const uint64_t BitSieve::unset_bit_[64] = { - ~(UINT64_C(1) << 0), ~(UINT64_C(1) << 1), ~(UINT64_C(1) << 2), - ~(UINT64_C(1) << 3), ~(UINT64_C(1) << 4), ~(UINT64_C(1) << 5), - ~(UINT64_C(1) << 6), ~(UINT64_C(1) << 7), ~(UINT64_C(1) << 8), - ~(UINT64_C(1) << 9), ~(UINT64_C(1) << 10), ~(UINT64_C(1) << 11), - ~(UINT64_C(1) << 12), ~(UINT64_C(1) << 13), ~(UINT64_C(1) << 14), - ~(UINT64_C(1) << 15), ~(UINT64_C(1) << 16), ~(UINT64_C(1) << 17), - ~(UINT64_C(1) << 18), ~(UINT64_C(1) << 19), ~(UINT64_C(1) << 20), - ~(UINT64_C(1) << 21), ~(UINT64_C(1) << 22), ~(UINT64_C(1) << 23), - ~(UINT64_C(1) << 24), ~(UINT64_C(1) << 25), ~(UINT64_C(1) << 26), - ~(UINT64_C(1) << 27), ~(UINT64_C(1) << 28), ~(UINT64_C(1) << 29), - ~(UINT64_C(1) << 30), ~(UINT64_C(1) << 31), ~(UINT64_C(1) << 32), - ~(UINT64_C(1) << 33), ~(UINT64_C(1) << 34), ~(UINT64_C(1) << 35), - ~(UINT64_C(1) << 36), ~(UINT64_C(1) << 37), ~(UINT64_C(1) << 38), - ~(UINT64_C(1) << 39), ~(UINT64_C(1) << 40), ~(UINT64_C(1) << 41), - ~(UINT64_C(1) << 42), ~(UINT64_C(1) << 43), ~(UINT64_C(1) << 44), - ~(UINT64_C(1) << 45), ~(UINT64_C(1) << 46), ~(UINT64_C(1) << 47), - ~(UINT64_C(1) << 48), ~(UINT64_C(1) << 49), ~(UINT64_C(1) << 50), - ~(UINT64_C(1) << 51), ~(UINT64_C(1) << 52), ~(UINT64_C(1) << 53), - ~(UINT64_C(1) << 54), ~(UINT64_C(1) << 55), ~(UINT64_C(1) << 56), - ~(UINT64_C(1) << 57), ~(UINT64_C(1) << 58), ~(UINT64_C(1) << 59), - ~(UINT64_C(1) << 60), ~(UINT64_C(1) << 61), ~(UINT64_C(1) << 62), - ~(UINT64_C(1) << 63) + ~(1ull << 0), ~(1ull << 1), ~(1ull << 2), ~(1ull << 3), + ~(1ull << 4), ~(1ull << 5), ~(1ull << 6), ~(1ull << 7), + ~(1ull << 8), ~(1ull << 9), ~(1ull << 10), ~(1ull << 11), + ~(1ull << 12), ~(1ull << 13), ~(1ull << 14), ~(1ull << 15), + ~(1ull << 16), ~(1ull << 17), ~(1ull << 18), ~(1ull << 19), + ~(1ull << 20), ~(1ull << 21), ~(1ull << 22), ~(1ull << 23), + ~(1ull << 24), ~(1ull << 25), ~(1ull << 26), ~(1ull << 27), + ~(1ull << 28), ~(1ull << 29), ~(1ull << 30), ~(1ull << 31), + ~(1ull << 32), ~(1ull << 33), ~(1ull << 34), ~(1ull << 35), + ~(1ull << 36), ~(1ull << 37), ~(1ull << 38), ~(1ull << 39), + ~(1ull << 40), ~(1ull << 41), ~(1ull << 42), ~(1ull << 43), + ~(1ull << 44), ~(1ull << 45), ~(1ull << 46), ~(1ull << 47), + ~(1ull << 48), ~(1ull << 49), ~(1ull << 50), ~(1ull << 51), + ~(1ull << 52), ~(1ull << 53), ~(1ull << 54), ~(1ull << 55), + ~(1ull << 56), ~(1ull << 57), ~(1ull << 58), ~(1ull << 59), + ~(1ull << 60), ~(1ull << 61), ~(1ull << 62), ~(1ull << 63) }; BitSieve::BitSieve(std::size_t size) : @@ -158,31 +147,4 @@ void BitSieve::pre_sieve(uint64_t c, uint64_t low) } } -/// Count the number of 1 bits inside [start, stop] -uint64_t BitSieve::count(uint64_t start, - uint64_t stop) const -{ - if (start > stop) - return 0; - - assert(stop < size_); - - uint64_t start_idx = start / 64; - uint64_t stop_idx = stop / 64; - uint64_t m1 = UINT64_C(0xffffffffffffffff) << (start % 64); - uint64_t m2 = UINT64_C(0xffffffffffffffff) >> (63 - stop % 64); - uint64_t bit_count; - - if (start_idx == stop_idx) - bit_count = popcnt64(sieve_[start_idx] & (m1 & m2)); - else - { - bit_count = popcnt64(sieve_[start_idx] & m1); - bit_count += popcnt64(&sieve_[start_idx + 1], stop_idx - (start_idx + 1)); - bit_count += popcnt64(sieve_[stop_idx] & m2); - } - - return bit_count; -} - } // namespace From 056961a668071851e25beded709a24f5647c956a Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sun, 26 Feb 2017 22:29:32 +0100 Subject: [PATCH 02/37] Silence OS X warning --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb2987231..4797963f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8.4) +cmake_minimum_required(VERSION 3.0) project(primecount) set(PRIMECOUNT_VERSION_MAJOR 3) set(PRIMECOUNT_VERSION_MINOR 5) From 0c57ad679fac77dea544449c3696ddb4a06716a0 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sun, 26 Feb 2017 22:48:51 +0100 Subject: [PATCH 03/37] Update comment --- src/BitSieve-popcnt.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 2f76cd242..941180f23 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -1,8 +1,8 @@ /// /// @file BitSieve-popcnt.cpp /// @brief Count the number of 1 bits inside a 64-bit array. -/// The vectorized popcount algorithms used in this file are -/// described in the paper "Faster Population Counts using AVX2 +/// The AVX2 popcount algorithm used in this file is described +/// in the paper "Faster Population Counts using AVX2 /// Instructions" by Wojciech Muła, Nathan Kurz, Daniel Lemire. /// @see https://arxiv.org/abs/1611.07612 /// @see https://github.com/WojciechMula/sse-popcount From 0c836db1634bd639057a7ed78861b52a1cd31722 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sun, 26 Feb 2017 22:59:13 +0100 Subject: [PATCH 04/37] Remove newline --- src/BitSieve-popcnt.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 941180f23..9c3919ac1 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -306,4 +306,3 @@ uint64_t BitSieve::count(uint64_t start, } } // namespace - From 7582d6bc6a7f47e858f755874bb6ae3a991c05f8 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 08:15:52 +0100 Subject: [PATCH 05/37] Install CMake 3.x --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 8b468277c..050f67269 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,7 @@ env: - OMP_NUM_THREADS=4 before_install: + - sudo add-apt-repository ppa:george-edison55/precise-backports - sudo apt-get update -qq # Install cmake, OpenMPI, cppcheck From f8cc9b7077cb7d21c2441328e02b617c8bf3a02f Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 08:31:27 +0100 Subject: [PATCH 06/37] Add --yes option --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 050f67269..3df3e84d0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ env: - OMP_NUM_THREADS=4 before_install: - - sudo add-apt-repository ppa:george-edison55/precise-backports + - sudo add-apt-repository ppa:george-edison55/precise-backports -y - sudo apt-get update -qq # Install cmake, OpenMPI, cppcheck From 8c4d4ae16503a12a2a5cd636aed9030f63e04c2e Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 08:33:41 +0100 Subject: [PATCH 07/37] Improved compiler flag checks --- CMakeLists.txt | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4797963f8..28db3f216 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,7 @@ option(ENABLE_MPI "Enable MPI (Message Passing Interface)" OFF) # Include Check* ##################################################### include(CheckCXXSourceRuns) -include(CheckCXXSourceCompiles) +include(CheckCXXCompilerFlag) include(CheckTypeSize) # primecount binary source files ##################################### @@ -107,43 +107,28 @@ if (OPENMP_FOUND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() -# Reset CMAKE_CXX_FLAGS for compiler checks ########################## +# Use -Werror for compiler checks #################################### set(CXX_FLAGS "${CMAKE_CXX_FLAGS}") -set(CMAKE_CXX_FLAGS "") +set(CMAKE_CXX_FLAGS -Werror) # Check -mpopcnt compiler flag ####################################### -set(CMAKE_CXX_FLAGS "-mpopcnt -Werror") - -check_cxx_source_compiles( - "int main(int, char**) { return 0; }" - mpopcnt) - +check_cxx_compiler_flag(-mpopcnt mpopcnt) if(mpopcnt) set(CXX_FLAGS "${CXX_FLAGS} -mpopcnt") endif() # Check -mpopcntd compiler flag ###################################### -set(CMAKE_CXX_FLAGS "-mpopcntd -Werror") - -check_cxx_source_compiles( - "int main(int, char**) { return 0; }" - mpopcntd) - +check_cxx_compiler_flag(-mpopcntd mpopcntd) if(mpopcntd) set(CXX_FLAGS "${CXX_FLAGS} -mpopcntd") endif() # Check -mavx2 compiler flag ######################################### -set(CMAKE_CXX_FLAGS "-mavx2 -Werror") - -check_cxx_source_compiles( - "int main(int, char**) { return 0; }" - mavx2) - +check_cxx_compiler_flag(-mavx2 mavx2) if(mavx2) set_source_files_properties(src/BitSieve-popcnt.cpp PROPERTIES COMPILE_FLAGS -mavx2) endif() From 578cda71b07c239407b38d2ef946272081c2cde8 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 08:53:24 +0100 Subject: [PATCH 08/37] Fix CMake 3.x installation --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3df3e84d0..6dcdc13ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ before_install: # Install cmake, OpenMPI, cppcheck install: - - sudo apt-get install -y cmake openmpi-bin libopenmpi-dev cppcheck + - sudo apt-get install -y cmake cmake-data openmpi-bin libopenmpi-dev cppcheck # Test build.sh script before_script: From 9dc07a599d4246f1501387954d0c9185b5acc511 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 13:57:13 +0100 Subject: [PATCH 09/37] CMake check __attribute__ target("avx2") --- CMakeLists.txt | 49 +++++++++++++-------- src/BitSieve-popcnt.cpp | 95 +++++------------------------------------ 2 files changed, 41 insertions(+), 103 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 28db3f216..2b5069cd2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,10 +8,12 @@ set(CMAKE_BUILD_TYPE Release) # Build options ###################################################### -option(ENABLE_MPI "Enable MPI (Message Passing Interface)" OFF) +option(ENABLE_POPCNT "Enable POPCNT instruction" ON) +option(ENABLE_MPI "Enable MPI (Message Passing Interface)" OFF) # Include Check* ##################################################### +include(CheckCXXSourceCompiles) include(CheckCXXSourceRuns) include(CheckCXXCompilerFlag) include(CheckTypeSize) @@ -103,39 +105,50 @@ endif() # Check for OpenMP ################################################### find_package(OpenMP QUIET) -if (OPENMP_FOUND) +if(OPENMP_FOUND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() # Use -Werror for compiler checks #################################### -set(CXX_FLAGS "${CMAKE_CXX_FLAGS}") +set(COPY_CXX_FLAGS "${CMAKE_CXX_FLAGS}") set(CMAKE_CXX_FLAGS -Werror) # Check -mpopcnt compiler flag ####################################### -check_cxx_compiler_flag(-mpopcnt mpopcnt) -if(mpopcnt) - set(CXX_FLAGS "${CXX_FLAGS} -mpopcnt") +if(ENABLE_POPCNT) + check_cxx_compiler_flag(-mpopcnt mpopcnt) + if(mpopcnt) + set(COPY_CXX_FLAGS "${COPY_CXX_FLAGS} -mpopcnt") + endif() endif() # Check -mpopcntd compiler flag ###################################### -check_cxx_compiler_flag(-mpopcntd mpopcntd) -if(mpopcntd) - set(CXX_FLAGS "${CXX_FLAGS} -mpopcntd") +if(ENABLE_POPCNT) + check_cxx_compiler_flag(-mpopcntd mpopcntd) + if(mpopcntd) + set(COPY_CXX_FLAGS "${COPY_CXX_FLAGS} -mpopcntd") + endif() endif() -# Check -mavx2 compiler flag ######################################### +# Restore CMAKE_CXX_FLAGS ############################################ -check_cxx_compiler_flag(-mavx2 mavx2) -if(mavx2) - set_source_files_properties(src/BitSieve-popcnt.cpp PROPERTIES COMPILE_FLAGS -mavx2) -endif() +set(CMAKE_CXX_FLAGS "${COPY_CXX_FLAGS}") -# Restore CMAKE_CXX_FLAGS ############################################ +# Check target("avx2") ############################################### + +check_cxx_source_compiles(" + __attribute__ ((target (\"default\"))) + int func() { return 0; } + __attribute__ ((target (\"avx2\"))) + int func() { return 1; } + int main() { return func(); }" + target_avx2) -set(CMAKE_CXX_FLAGS "${CXX_FLAGS}") +if(target_avx2) + add_definitions(-DHAVE_TARGET_AVX2) +endif() # Check if int128_t type exists ###################################### @@ -185,11 +198,11 @@ check_cxx_source_runs(" return 0; }" - HAVE_LIBDIVIDE) + use_libdivide) # Include S2_easy* source files ###################################### -if(HAVE_LIBDIVIDE) +if(use_libdivide) set(LIB_SRC ${LIB_SRC} src/deleglise-rivat/S2_easy_libdivide.cpp) if(ENABLE_MPI) set(LIB_SRC ${LIB_SRC} src/mpi/deleglise-rivat/S2_easy_mpi_libdivide.cpp) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 9c3919ac1..6e48b8de4 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -20,66 +20,6 @@ #include #include -namespace { -namespace DEFAULT { - -void CSA(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c) -{ - uint64_t u = a ^ b; - h = (a & b) | (u & c); - l = u ^ c; -} - -/// Harley-Seal popcount (4th iteration). -/// The Harley-Seal popcount algorithm is one of the fastest algorithms -/// for counting 1 bits in an array using only integer operations. -/// This implementation uses only 5.69 instructions per 64-bit word. -/// @see Chapter 5 in "Hacker's Delight" 2nd edition. -/// -uint64_t popcnt(const uint64_t* data, uint64_t size) -{ - uint64_t total = 0; - uint64_t ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0; - uint64_t twosA, twosB, foursA, foursB, eightsA, eightsB; - uint64_t limit = size - size % 16; - uint64_t i = 0; - - for(; i < limit; i += 16) - { - CSA(twosA, ones, ones, data[i+0], data[i+1]); - CSA(twosB, ones, ones, data[i+2], data[i+3]); - CSA(foursA, twos, twos, twosA, twosB); - CSA(twosA, ones, ones, data[i+4], data[i+5]); - CSA(twosB, ones, ones, data[i+6], data[i+7]); - CSA(foursB, twos, twos, twosA, twosB); - CSA(eightsA,fours, fours, foursA, foursB); - CSA(twosA, ones, ones, data[i+8], data[i+9]); - CSA(twosB, ones, ones, data[i+10], data[i+11]); - CSA(foursA, twos, twos, twosA, twosB); - CSA(twosA, ones, ones, data[i+12], data[i+13]); - CSA(twosB, ones, ones, data[i+14], data[i+15]); - CSA(foursB, twos, twos, twosA, twosB); - CSA(eightsB, fours, fours, foursA, foursB); - CSA(sixteens, eights, eights, eightsA, eightsB); - - total += popcnt64(sixteens); - } - - total *= 16; - total += 8 * popcnt64(eights); - total += 4 * popcnt64(fours); - total += 2 * popcnt64(twos); - total += 1 * popcnt64(ones); - - for(; i < size; i++) - total += popcnt64(data[i]); - - return total; -} - -} // namespace DEFAULT -} // namespace - namespace { namespace POPCNT { @@ -111,22 +51,19 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) } // namespace POPCNT } // namespace -#if defined(__x86_64__) && \ - defined(__GNUC__) && \ - (__GNUC__ > 4 || \ - (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) +#if defined(HAVE_TARGET_AVX2) #include namespace { namespace AVX2 { +__attribute__ ((target ("avx2"))) __m256i popcnt(const __m256i v) { __m256i m1 = _mm256_set1_epi8(0x55); __m256i m2 = _mm256_set1_epi8(0x33); __m256i m4 = _mm256_set1_epi8(0x0F); - __m256i t1 = _mm256_sub_epi8(v, (_mm256_srli_epi16(v, 1) & m1)); __m256i t2 = _mm256_add_epi8(t1 & m2, (_mm256_srli_epi16(t1, 2) & m2)); __m256i t3 = _mm256_add_epi8(t2, _mm256_srli_epi16(t2, 4)) & m4; @@ -134,6 +71,7 @@ __m256i popcnt(const __m256i v) return _mm256_sad_epu8(t3, _mm256_setzero_si256()); } +__attribute__ ((target ("avx2"))) void CSA(__m256i& h, __m256i& l, __m256i a, __m256i b, __m256i c) { __m256i u = a ^ b; @@ -147,6 +85,7 @@ void CSA(__m256i& h, __m256i& l, __m256i a, __m256i b, __m256i c) /// Wojciech Mula (23 Nov 2016). /// @see https://arxiv.org/abs/1611.07612 /// +__attribute__ ((target ("avx2"))) uint64_t popcnt(const __m256i* data, uint64_t size) { __m256i total = _mm256_setzero_si256(); @@ -199,16 +138,18 @@ uint64_t popcnt(const __m256i* data, uint64_t size) } /// Align memory to 32 bytes boundary -void align(const uint64_t*& data, uint64_t* size, uint64_t* total) +void align(const uint64_t*& p, + uint64_t* size, + uint64_t* total) { - for (; *size > 0 && (uintptr_t) data % 32 != 0; data++) + for (; *size > 0 && (uintptr_t) p % 32 != 0; p++) { - *total += popcnt64(*data); + *total += popcnt64(*p); *size -= 1; } } -/// AVX2 popcount algorithm for 64-bit arrays. +/// AVX2 popcount algorithm. /// @param data A 64-bit array /// @param size Length of data array /// @@ -235,26 +176,10 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) } // namespace AVX2 } // namespace -#endif /* AVX2 */ - -/// Function multi-versioning is currently (February 2017) -/// only supported by GCC >= 4.8 -/// -#if defined(__x86_64__) && \ - defined(__GNUC__) && \ - (__GNUC__ > 4 || \ - (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) - namespace { __attribute__ ((target ("default"))) uint64_t popcnt(const uint64_t* data, uint64_t size) -{ - return DEFAULT::popcnt(data, size); -} - -__attribute__ ((target ("popcnt"))) -uint64_t popcnt(const uint64_t* data, uint64_t size) { return POPCNT::popcnt(data, size); } From d01354855463cd01322c337b6ae01a84f4cfabe3 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 18:17:16 +0100 Subject: [PATCH 10/37] Update year --- COPYING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/COPYING b/COPYING index 07dc6aed2..f5323e3b5 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ BSD 2-Clause License -Copyright (c) 2013 - 2016, Kim Walisch. +Copyright (c) 2013 - 2017, Kim Walisch. All rights reserved. Redistribution and use in source and binary forms, with or without From 8c20892ab70aa4881675afd9327b31a02377d799 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 18:18:55 +0100 Subject: [PATCH 11/37] Update copyright year --- src/app/help.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/app/help.cpp b/src/app/help.cpp index 05600f297..94a2db17d 100644 --- a/src/app/help.cpp +++ b/src/app/help.cpp @@ -58,7 +58,7 @@ const string helpMenu( const string versionInfo( "primecount " PRIMECOUNT_VERSION ", \n" - "Copyright (C) 2016 Kim Walisch\n" + "Copyright (C) 2013 - 2017 Kim Walisch\n" "BSD 2-Clause License " ); From 7f5744c69e0d3c756e8f6736eea3a295346059ae Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 18:31:27 +0100 Subject: [PATCH 12/37] Changes in version 3.6 --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index 6560689ba..7791412d1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2017-02-27 Kim Walisch + + Version 3.6 work in progress. + + This version features a new AVX2 popcount algorithm which + computes the hard special leaves up to 15% faster on x86 CPUs + with AVX2 support (2013 or later). + + * BitSieve-popcnt.cpp: New AVX2 popcount algorithm. + * CMakeLists.txt: Add AVX2 check. + 2016-12-16 Kim Walisch Version 3.5 released. From 7ae5e8a4873a0b7b345866a4fe5e2039aa58e75d Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 20:21:48 +0100 Subject: [PATCH 13/37] Improve code readability --- src/BitSieve-popcnt.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 6e48b8de4..7990801b8 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -176,6 +176,10 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) } // namespace AVX2 } // namespace +#endif + +#if defined(HAVE_TARGET_AVX2) + namespace { __attribute__ ((target ("default"))) From c98c51710627db631c4e74842b8606cd55514b0e Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 20:36:00 +0100 Subject: [PATCH 14/37] Fix clang performance bug On x86_64 when compiled with clang++ the popcnt64() was erroneously defined as: inline uint64_t popcnt64(uint64_t x) { return __builtin_popcount((uint32_t) x) + __builtin_popcount((uint32_t)(x >> 32)); } Instead of: inline uint64_t popcnt64(uint64_t x) { return __builtin_popcountll(x); } --- include/popcnt.hpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/include/popcnt.hpp b/include/popcnt.hpp index af730787c..63cf068d1 100644 --- a/include/popcnt.hpp +++ b/include/popcnt.hpp @@ -39,18 +39,8 @@ inline uint64_t popcnt64(uint64_t x) _mm_popcnt_u32((uint32_t)(x >> 32)); } -#elif __has_builtin(__builtin_popcount) || \ - (defined(__GNUC__) && \ - defined(__i386__)) - -inline uint64_t popcnt64(uint64_t x) -{ - return __builtin_popcount((uint32_t) x) + - __builtin_popcount((uint32_t)(x >> 32)); -} - -#elif __has_builtin(__builtin_popcountll) || \ - defined(__GNUC__) +#elif defined(__GNUC__) || \ + __has_builtin(__builtin_popcountll) inline uint64_t popcnt64(uint64_t x) { From e0c6fd3d7151c03bf5d14e54bef990c289f92ee7 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 27 Feb 2017 21:17:09 +0100 Subject: [PATCH 15/37] Changes in version 3.6 --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index 7791412d1..11f0975e6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -7,6 +7,7 @@ with AVX2 support (2013 or later). * BitSieve-popcnt.cpp: New AVX2 popcount algorithm. + * popcnt.hpp: Fix clang performance bug. * CMakeLists.txt: Add AVX2 check. 2016-12-16 Kim Walisch From f89d0267d395a13dd0e93a84576e25b0f7282217 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Tue, 28 Feb 2017 10:16:21 +0100 Subject: [PATCH 16/37] Minor performance improvement --- src/BitSieve-popcnt.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 7990801b8..d2d23e310 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -142,8 +142,9 @@ void align(const uint64_t*& p, uint64_t* size, uint64_t* total) { - for (; *size > 0 && (uintptr_t) p % 32 != 0; p++) + for (; (uintptr_t) p % 32 != 0; p++) { + assert(*size > 0); *total += popcnt64(*p); *size -= 1; } From b19ade06d560d7e99def23c6dab37f1f1d21aff8 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Tue, 28 Feb 2017 18:01:43 +0100 Subject: [PATCH 17/37] Update timings --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 9f46fe082..146da4aac 100644 --- a/README.md +++ b/README.md @@ -207,7 +207,7 @@ Benchmarks 3,204,941,750,802 4.01s 1.96s - 0.43s + 0.38s 0.22s @@ -215,32 +215,32 @@ Benchmarks 29,844,570,422,669 27.75s 12.08s - 1.65s - 0.77s + 1.52s + 0.76s 1016 279,238,341,033,925 232.30s 92.09s - 7.31s - 2.83s + 6.87s + 2.67s 1017 2,623,557,157,654,233 1,836.73s 731.35s - 33.66s - 11.20s + 31.63s + 10.66s 1018 24,739,954,287,740,860 14,949.16s 6,631.73s - 157.15s - 47.07s + 146.55s + 44.54s 1019 @@ -248,7 +248,7 @@ Benchmarks NaN NaN NaN - 225.08s + 209.57s 1020 @@ -256,7 +256,7 @@ Benchmarks NaN NaN NaN - 1,029.41s + 939.88s 1021 @@ -264,7 +264,7 @@ Benchmarks NaN NaN NaN - 4,867.45s + 4,536.14s 1022 @@ -278,7 +278,7 @@ Benchmarks The benchmarks above were run on an Intel Core i7-6700 CPU (4 x 3.4 GHz) from 2015 using a Linux x64 operating system and primecount was compiled using -GCC 5.2. +GCC 5.4. Build instructions ------------------ From 8beb6ac3a92263501ccf4d2d4c26d08a9e927017 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Tue, 28 Feb 2017 18:54:30 +0100 Subject: [PATCH 18/37] update copyright year --- src/BitSieve-popcnt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index d2d23e310..99a92f22b 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -7,7 +7,7 @@ /// @see https://arxiv.org/abs/1611.07612 /// @see https://github.com/WojciechMula/sse-popcount /// -/// Copyright (C) 2016 Kim Walisch, +/// Copyright (C) 2017 Kim Walisch, /// /// This file is distributed under the BSD License. See the COPYING /// file in the top level directory. From 7176e600aca350eff7d91f31c0cf746d75f94829 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Tue, 28 Feb 2017 19:12:07 +0100 Subject: [PATCH 19/37] Update version to 3.6 --- CMakeLists.txt | 2 +- README.md | 8 ++++---- doc/primecount-MPI.md | 2 +- include/primecount.hpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b5069cd2..eafe3441a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.0) project(primecount) set(PRIMECOUNT_VERSION_MAJOR 3) -set(PRIMECOUNT_VERSION_MINOR 5) +set(PRIMECOUNT_VERSION_MINOR 6) set(PRIMECOUNT_VERSION "${PRIMECOUNT_VERSION_MAJOR}.${PRIMECOUNT_VERSION_MINOR}") include_directories(include src/primesieve/include) set(CMAKE_BUILD_TYPE Release) diff --git a/README.md b/README.md index 146da4aac..af9f08a5a 100644 --- a/README.md +++ b/README.md @@ -30,9 +30,9 @@ Below are the latest precompiled binaries for Windows 64-bit, Linux and OS X. These binaries are statically linked and require a CPU which supports the POPCNT instruction (2010 or later). -* [primecount-3.5-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.5-win64.zip), 404K -* [primecount-3.5-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.5-linux-x64.tar.gz), 915K -* [primecount-3.5-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.5-macOS-x64.tar.gz), 688K +* [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 404K +* [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 915K +* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 688K * Binaries with backup functionality are available [here](https://github.com/kimwalisch/primecount/tree/backup#primecount-backup) Usage examples @@ -285,7 +285,7 @@ Build instructions You need to have installed a C++ compiler, cmake and make to build primecount. Download -[primecount-3.5.zip](https://github.com/kimwalisch/primecount/archive/v3.5.zip) +[primecount-3.6.zip](https://github.com/kimwalisch/primecount/archive/v3.6.zip) and build it using: ```sh diff --git a/doc/primecount-MPI.md b/doc/primecount-MPI.md index 4daa47182..dfa951563 100644 --- a/doc/primecount-MPI.md +++ b/doc/primecount-MPI.md @@ -25,7 +25,7 @@ sudo apt-get install g++ make cmake libopenmpi-dev openmpi-bin ``` Then download -[primecount-3.5.zip](https://github.com/kimwalisch/primecount/archive/v3.5.zip) +[primecount-3.6.zip](https://github.com/kimwalisch/primecount/archive/v3.6.zip) and build it using: ```sh cmake -DENABLE_MPI=ON . diff --git a/include/primecount.hpp b/include/primecount.hpp index 6d2a123a0..5d536c50c 100644 --- a/include/primecount.hpp +++ b/include/primecount.hpp @@ -13,9 +13,9 @@ #include #include -#define PRIMECOUNT_VERSION "3.5" +#define PRIMECOUNT_VERSION "3.6" #define PRIMECOUNT_VERSION_MAJOR 3 -#define PRIMECOUNT_VERSION_MINOR 5 +#define PRIMECOUNT_VERSION_MINOR 6 namespace primecount { From f7c9f336def978e8eb8d1cd5854d18ecf99ef209 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Tue, 28 Feb 2017 22:29:09 +0100 Subject: [PATCH 20/37] Update file sizes --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index af9f08a5a..5e711fa11 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,8 @@ These binaries are statically linked and require a CPU which supports the POPCNT instruction (2010 or later). * [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 404K -* [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 915K -* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 688K +* [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 1018K +* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 897K * Binaries with backup functionality are available [here](https://github.com/kimwalisch/primecount/tree/backup#primecount-backup) Usage examples From 6f286c604f1ab370f47b7af8fd86a465c70f3238 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Wed, 1 Mar 2017 17:52:20 +0100 Subject: [PATCH 21/37] Improved AVX2 check --- CMakeLists.txt | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eafe3441a..19f150b8f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,45 +109,42 @@ if(OPENMP_FOUND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() -# Use -Werror for compiler checks #################################### - -set(COPY_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -set(CMAKE_CXX_FLAGS -Werror) - # Check -mpopcnt compiler flag ####################################### if(ENABLE_POPCNT) + set(COPY_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS -Werror) check_cxx_compiler_flag(-mpopcnt mpopcnt) + set(CMAKE_CXX_FLAGS "${COPY_CXX_FLAGS}") if(mpopcnt) - set(COPY_CXX_FLAGS "${COPY_CXX_FLAGS} -mpopcnt") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") endif() endif() # Check -mpopcntd compiler flag ###################################### if(ENABLE_POPCNT) + set(COPY_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS -Werror) check_cxx_compiler_flag(-mpopcntd mpopcntd) + set(CMAKE_CXX_FLAGS "${COPY_CXX_FLAGS}") if(mpopcntd) - set(COPY_CXX_FLAGS "${COPY_CXX_FLAGS} -mpopcntd") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcntd") endif() endif() -# Restore CMAKE_CXX_FLAGS ############################################ - -set(CMAKE_CXX_FLAGS "${COPY_CXX_FLAGS}") - -# Check target("avx2") ############################################### +# Check AVX2 (x86_64) compiler support ############################## check_cxx_source_compiles(" - __attribute__ ((target (\"default\"))) - int func() { return 0; } + #include __attribute__ ((target (\"avx2\"))) - int func() { return 1; } - int main() { return func(); }" - target_avx2) + void func() { __m256i v = _mm256_setzero_si256(); } + int x = __builtin_cpu_supports(\"avx2\"); + int main() { return 0; }" + have_avx2) -if(target_avx2) - add_definitions(-DHAVE_TARGET_AVX2) +if(have_avx2) + add_definitions(-DHAVE_AVX2) endif() # Check if int128_t type exists ###################################### From 1b64c39d48650a5b378abbc91ecf9bf0ff0d538d Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Wed, 1 Mar 2017 18:03:01 +0100 Subject: [PATCH 22/37] Get rid of GCC Function Multiversioning GCC Function Multiversioning only works on Linux. The new code uses __builtin_cpu_supports() and works on Windows, Linux and macOS with GCC. Clang will likely also work in near future. --- src/BitSieve-popcnt.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 99a92f22b..97799af7c 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -51,7 +51,7 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) } // namespace POPCNT } // namespace -#if defined(HAVE_TARGET_AVX2) +#if defined(HAVE_AVX2) #include @@ -179,24 +179,21 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) #endif -#if defined(HAVE_TARGET_AVX2) - namespace { -__attribute__ ((target ("default"))) -uint64_t popcnt(const uint64_t* data, uint64_t size) -{ - return POPCNT::popcnt(data, size); -} +#if defined(HAVE_AVX2) + +// calls CPUID at program startup +const int avx2 = __builtin_cpu_supports("avx2"); -__attribute__ ((target ("avx2"))) uint64_t popcnt(const uint64_t* data, uint64_t size) { - return AVX2::popcnt(data, size); + if (avx2) + return AVX2::popcnt(data, size); + else + return POPCNT::popcnt(data, size); } -} // namespace - #else uint64_t popcnt(const uint64_t* data, uint64_t size) @@ -206,6 +203,8 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) #endif +} // namespace + namespace primecount { /// Count the number of 1 bits inside [start, stop] From e121431306355e2150b5b4fb2bc07d9c590a0fab Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Wed, 1 Mar 2017 18:06:50 +0100 Subject: [PATCH 23/37] Test all branches --- appveyor.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 500475c8f..50cbf8b37 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -3,10 +3,6 @@ version: 1.0.{build} -branches: - only: - - master - platform: - x86 - x64 From 9f606ac4b53e77db3764af4a0f89c842c62cdb4d Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Wed, 1 Mar 2017 18:23:08 +0100 Subject: [PATCH 24/37] Rename AVX2 test --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 19f150b8f..91dc32214 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,9 +141,9 @@ check_cxx_source_compiles(" void func() { __m256i v = _mm256_setzero_si256(); } int x = __builtin_cpu_supports(\"avx2\"); int main() { return 0; }" - have_avx2) + AVX2) -if(have_avx2) +if(AVX2) add_definitions(-DHAVE_AVX2) endif() From fa507c6ca90f8f0ec11157281458b62b13145a75 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Wed, 1 Mar 2017 22:45:15 +0100 Subject: [PATCH 25/37] Fix AVX2 detection --- src/BitSieve-popcnt.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 97799af7c..3bb1db67d 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -183,12 +183,21 @@ namespace { #if defined(HAVE_AVX2) -// calls CPUID at program startup -const int avx2 = __builtin_cpu_supports("avx2"); +struct CpuInfo +{ + int has_avx2; + CpuInfo() + { + __builtin_cpu_init(); + has_avx2 = __builtin_cpu_supports("avx2"); + } +}; + +const CpuInfo cpuInfo; uint64_t popcnt(const uint64_t* data, uint64_t size) { - if (avx2) + if (cpuInfo.has_avx2) return AVX2::popcnt(data, size); else return POPCNT::popcnt(data, size); From ef1302be1a13332279e1b75c55b255aa8a229367 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Wed, 1 Mar 2017 22:58:00 +0100 Subject: [PATCH 26/37] Refactor PhiTiny code --- include/PhiTiny.hpp | 6 +++--- src/PhiTiny.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/PhiTiny.hpp b/include/PhiTiny.hpp index cd01d9b84..d721b8246 100644 --- a/include/PhiTiny.hpp +++ b/include/PhiTiny.hpp @@ -1,7 +1,7 @@ /// /// @file PhiTiny.hpp /// -/// Copyright (C) 2016 Kim Walisch, +/// Copyright (C) 2017 Kim Walisch, /// /// This file is distributed under the BSD License. See the COPYING /// file in the top level directory. @@ -56,6 +56,8 @@ class PhiTiny { static const int totients[7]; }; +extern const PhiTiny phiTiny; + inline bool is_phi_tiny(int64_t a) { return PhiTiny::is_tiny(a); @@ -66,7 +68,6 @@ inline bool is_phi_tiny(int64_t a) template typename prt::make_signed::type phi_tiny(X x, A a) { - extern const PhiTiny phiTiny; return phiTiny.phi(x, a); } @@ -75,7 +76,6 @@ typename prt::make_signed::type phi_tiny(X x, A a) template X phi_tiny(X x, A a) { - extern const PhiTiny phiTiny; return phiTiny.phi(x, a); } diff --git a/src/PhiTiny.cpp b/src/PhiTiny.cpp index 5c009a021..b1edcc8bc 100644 --- a/src/PhiTiny.cpp +++ b/src/PhiTiny.cpp @@ -7,7 +7,7 @@ /// phi(x, a) = (x / pp) * φ(pp) + phi(x % pp, a) /// with pp = 2 * 3 * ... * prime[a] /// -/// Copyright (C) 2016 Kim Walisch, +/// Copyright (C) 2017 Kim Walisch, /// /// This file is distributed under the BSD License. See the COPYING /// file in the top level directory. @@ -21,7 +21,7 @@ namespace primecount { -extern const PhiTiny phiTiny = PhiTiny(); +const PhiTiny phiTiny; const int PhiTiny::pi[20] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 8 }; From 409075d98b3d682fffb98330189f48e33dbceb76 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Thu, 2 Mar 2017 11:23:40 +0100 Subject: [PATCH 27/37] Refactor CPU detection code --- src/BitSieve-popcnt.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 3bb1db67d..e93f8010b 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -183,21 +183,11 @@ namespace { #if defined(HAVE_AVX2) -struct CpuInfo -{ - int has_avx2; - CpuInfo() - { - __builtin_cpu_init(); - has_avx2 = __builtin_cpu_supports("avx2"); - } -}; - -const CpuInfo cpuInfo; - uint64_t popcnt(const uint64_t* data, uint64_t size) { - if (cpuInfo.has_avx2) + static const int has_avx2 = __builtin_cpu_supports("avx2"); + + if (has_avx2) return AVX2::popcnt(data, size); else return POPCNT::popcnt(data, size); From 7b2d62e95585626f5c6fc8451f519d3ce9eaaac2 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Thu, 2 Mar 2017 17:25:27 +0100 Subject: [PATCH 28/37] Refactor --- src/BitSieve-popcnt.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index e93f8010b..3d55d21a5 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -28,9 +28,13 @@ namespace POPCNT { /// uint64_t popcnt(const uint64_t* data, uint64_t size) { - uint64_t sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0; - uint64_t limit = size - size % 4; + uint64_t sum0 = 0; + uint64_t sum1 = 0; + uint64_t sum2 = 0; + uint64_t sum3 = 0; + uint64_t i = 0; + uint64_t limit = size - size % 4; for (; i < limit; i += 4) { @@ -64,7 +68,7 @@ __m256i popcnt(const __m256i v) __m256i m1 = _mm256_set1_epi8(0x55); __m256i m2 = _mm256_set1_epi8(0x33); __m256i m4 = _mm256_set1_epi8(0x0F); - __m256i t1 = _mm256_sub_epi8(v, (_mm256_srli_epi16(v, 1) & m1)); + __m256i t1 = _mm256_sub_epi8(v, (_mm256_srli_epi16(v, 1) & m1)); __m256i t2 = _mm256_add_epi8(t1 & m2, (_mm256_srli_epi16(t1, 2) & m2)); __m256i t3 = _mm256_add_epi8(t2, _mm256_srli_epi16(t2, 4)) & m4; @@ -96,8 +100,8 @@ uint64_t popcnt(const __m256i* data, uint64_t size) __m256i sixteens = _mm256_setzero_si256(); __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; - uint64_t limit = size - size % 16; uint64_t i = 0; + uint64_t limit = size - size % 16; for(; i < limit; i += 16) { @@ -139,14 +143,14 @@ uint64_t popcnt(const __m256i* data, uint64_t size) /// Align memory to 32 bytes boundary void align(const uint64_t*& p, - uint64_t* size, - uint64_t* total) + uint64_t& size, + uint64_t& total) { for (; (uintptr_t) p % 32 != 0; p++) { - assert(*size > 0); - *total += popcnt64(*p); - *size -= 1; + assert(size > 0); + total += popcnt64(*p); + size -= 1; } } @@ -162,7 +166,7 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) // for array sizes >= 1 kilobyte if (size * 8 >= 1024) { - align(data, &size, &total); + align(data, size, total); total += popcnt((const __m256i*) data, size / 4); data += size - size % 4; size = size % 4; From 49a786e83710b4e6e6dd0a074209cfc3375b076b Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Thu, 2 Mar 2017 20:26:17 +0100 Subject: [PATCH 29/37] Fix time measuring When OpenMP was disabled the elapsed time was not accurate. --- src/primecount.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/primecount.cpp b/src/primecount.cpp index 6f139e047..6a78e8755 100644 --- a/src/primecount.cpp +++ b/src/primecount.cpp @@ -275,7 +275,7 @@ double get_wtime() #ifdef _OPENMP return omp_get_wtime(); #else - return (double) (std::clock() / CLOCKS_PER_SEC); + return (double) std::clock() / CLOCKS_PER_SEC; #endif } From 18e97d8880519508f2436e13fecb2d14499444a3 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Thu, 2 Mar 2017 20:53:17 +0100 Subject: [PATCH 30/37] Update file sizes --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5e711fa11..3bd68a92c 100644 --- a/README.md +++ b/README.md @@ -30,9 +30,9 @@ Below are the latest precompiled binaries for Windows 64-bit, Linux and OS X. These binaries are statically linked and require a CPU which supports the POPCNT instruction (2010 or later). -* [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 404K +* [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 400K * [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 1018K -* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 897K +* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 922K * Binaries with backup functionality are available [here](https://github.com/kimwalisch/primecount/tree/backup#primecount-backup) Usage examples From 8a469d2014a2a656697bce2992890e54e3987652 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Thu, 2 Mar 2017 21:37:04 +0100 Subject: [PATCH 31/37] Describe AVX2 usage --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3bd68a92c..419bf411c 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,10 @@ used to compute several world records e.g. Binaries -------- -Below are the latest precompiled binaries for Windows 64-bit, Linux and OS X. +Below are the latest primecount binaries for Windows 64-bit, Linux and macOS. These binaries are statically linked and require a CPU which supports the POPCNT -instruction (2010 or later). +instruction (2010 or later). primecount also uses the AVX2 instruction set (if +available) to speed up the computation of the hard special leaves. * [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 400K * [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 1018K From f90dddfa2b0513f30ada60a520107b56850feb6e Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Thu, 2 Mar 2017 22:01:59 +0100 Subject: [PATCH 32/37] Changes in version 3.6 --- ChangeLog | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 11f0975e6..c59273394 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,6 @@ -2017-02-27 Kim Walisch +2017-03-02 Kim Walisch - Version 3.6 work in progress. + Version 3.6 released. This version features a new AVX2 popcount algorithm which computes the hard special leaves up to 15% faster on x86 CPUs @@ -8,6 +8,7 @@ * BitSieve-popcnt.cpp: New AVX2 popcount algorithm. * popcnt.hpp: Fix clang performance bug. + * primecount.cpp: Fix clang time measuring. * CMakeLists.txt: Add AVX2 check. 2016-12-16 Kim Walisch From fa33c9b0eda8d36f39d80a02ca01b4392d554595 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Thu, 2 Mar 2017 23:12:21 +0100 Subject: [PATCH 33/37] Update file sizes --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 419bf411c..194507234 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,9 @@ These binaries are statically linked and require a CPU which supports the POPCNT instruction (2010 or later). primecount also uses the AVX2 instruction set (if available) to speed up the computation of the hard special leaves. -* [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 400K -* [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 1018K -* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 922K +* [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 400 KB +* [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 1 MB +* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 922 KB * Binaries with backup functionality are available [here](https://github.com/kimwalisch/primecount/tree/backup#primecount-backup) Usage examples From ca82262412bbf6fff9a67b120befa0e313064b0a Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sat, 4 Mar 2017 14:21:49 +0100 Subject: [PATCH 34/37] Improved AVX2 popcount --- src/BitSieve-popcnt.cpp | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/BitSieve-popcnt.cpp b/src/BitSieve-popcnt.cpp index 3d55d21a5..71b9d3ff4 100644 --- a/src/BitSieve-popcnt.cpp +++ b/src/BitSieve-popcnt.cpp @@ -62,19 +62,6 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) namespace { namespace AVX2 { -__attribute__ ((target ("avx2"))) -__m256i popcnt(const __m256i v) -{ - __m256i m1 = _mm256_set1_epi8(0x55); - __m256i m2 = _mm256_set1_epi8(0x33); - __m256i m4 = _mm256_set1_epi8(0x0F); - __m256i t1 = _mm256_sub_epi8(v, (_mm256_srli_epi16(v, 1) & m1)); - __m256i t2 = _mm256_add_epi8(t1 & m2, (_mm256_srli_epi16(t1, 2) & m2)); - __m256i t3 = _mm256_add_epi8(t2, _mm256_srli_epi16(t2, 4)) & m4; - - return _mm256_sad_epu8(t3, _mm256_setzero_si256()); -} - __attribute__ ((target ("avx2"))) void CSA(__m256i& h, __m256i& l, __m256i a, __m256i b, __m256i c) { @@ -83,6 +70,32 @@ void CSA(__m256i& h, __m256i& l, __m256i a, __m256i b, __m256i c) l = u ^ c; } +__attribute__ ((target ("avx2"))) +__m256i popcnt(__m256i v) +{ + __m256i lookup1 = _mm256_setr_epi8( + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8 + ); + + __m256i lookup2 = _mm256_setr_epi8( + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0, + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0 + ); + + __m256i low_mask = _mm256_set1_epi8(0x0f); + __m256i lo = v & low_mask; + __m256i hi = _mm256_srli_epi16(v, 4) & low_mask; + __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); + __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); + + return _mm256_sad_epu8(popcnt1, popcnt2); +} + /// AVX2 Harley-Seal popcount (4th iteration). /// The algorithm is based on the paper "Faster Population Counts /// using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and @@ -163,7 +176,7 @@ uint64_t popcnt(const uint64_t* data, uint64_t size) uint64_t total = 0; // AVX2 popcount is faster than POPCNT - // for array sizes >= 1 kilobyte + // for array sizes >= 1 KB if (size * 8 >= 1024) { align(data, size, total); From ad888114f22db116d2c9e0da80d589d7ab36a537 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sat, 4 Mar 2017 15:22:08 +0100 Subject: [PATCH 35/37] Update macOS file size --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 194507234..e69c8584c 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ available) to speed up the computation of the hard special leaves. * [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 400 KB * [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 1 MB -* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 922 KB +* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 899 KB * Binaries with backup functionality are available [here](https://github.com/kimwalisch/primecount/tree/backup#primecount-backup) Usage examples From 0af058008f324461128048e6d61ea87230c30717 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sat, 4 Mar 2017 15:24:14 +0100 Subject: [PATCH 36/37] Update macOS file extension --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e69c8584c..ccdfe5dc6 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ available) to speed up the computation of the hard special leaves. * [primecount-3.6-win64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-win64.zip), 400 KB * [primecount-3.6-linux-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-linux-x64.tar.gz), 1 MB -* [primecount-3.6-macOS-x64.tar.gz](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.tar.gz), 899 KB +* [primecount-3.6-macOS-x64.zip](https://dl.bintray.com/kimwalisch/primecount/primecount-3.6-macOS-x64.zip), 899 KB * Binaries with backup functionality are available [here](https://github.com/kimwalisch/primecount/tree/backup#primecount-backup) Usage examples From 8f082a3bb133b93b7c13cb560de7189ab8584662 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sat, 4 Mar 2017 15:26:44 +0100 Subject: [PATCH 37/37] Changes in version 3.6 --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index c59273394..3600ffdd5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -2017-03-02 Kim Walisch +2017-03-04 Kim Walisch Version 3.6 released.