diff --git a/src/low/curve2251-sse/CMakeLists.txt b/src/low/curve2251-sse/CMakeLists.txt
new file mode 100644
index 000000000..0558cc2e4
--- /dev/null
+++ b/src/low/curve2251-sse/CMakeLists.txt
@@ -0,0 +1 @@
+set(ARITH_LIBS "gmp")
\ No newline at end of file
diff --git a/src/low/curve2251-sse/macros.h b/src/low/curve2251-sse/macros.h
new file mode 100755
index 000000000..d6a163919
--- /dev/null
+++ b/src/low/curve2251-sse/macros.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Useful macros for binary field arithmetic.
+ *
+ * The multi-statement macros below are statement sequences, not expressions.
+ * They read and write __m128i temporaries that must already be declared in
+ * the expanding scope: t0..t3 for the multipliers and RED251(), and
+ * additionally m0..m3, m8 and m9 for REDUCE().
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+/* Thin aliases for SSE intrinsics. All of them are plain expressions. */
+#define PSHUFB(A, B) _mm_shuffle_epi8(A, B)
+#define SHL64(A, B) _mm_slli_epi64(A, B)
+#define SHR64(A, B) _mm_srli_epi64(A, B)
+#define XOR(A, B) _mm_xor_si128(A, B)
+/* SHL8/SHR8 shift the whole 128-bit register by B bytes. */
+#define SHL8(A, B) _mm_slli_si128(A, B)
+#define SHR8(A, B) _mm_srli_si128(A, B)
+#define AND(A, B) _mm_and_si128(A, B)
+
+/**
+ * Carry-less 128x128-bit multiplication with three 64x64 products
+ * (Karatsuba): low halves, high halves, and the XOR-ed halves of each
+ * operand. The 256-bit product is left in t1 (upper 128 bits) and t0 (lower
+ * 128 bits); t2 and t3 are clobbered.
+ */
+#define MUL(ma, mb) \
+	t0 = _mm_clmulepi64_si128(ma, mb, 0x00);\
+	t1 = _mm_clmulepi64_si128(ma, mb, 0x11);\
+	t2 = XOR(SHR8(ma, 8), ma);\
+	t3 = XOR(SHR8(mb, 8), mb);\
+	t2 = _mm_clmulepi64_si128(t2, t3, 0x00);\
+	t2 = XOR(t2, t0);\
+	t2 = XOR(t2, t1);\
+	t3 = SHR8(t2, 8);\
+	t2 = SHL8(t2, 8);\
+	t0 = XOR(t0, t2);\
+	t1 = XOR(t1, t3);
+
+/**
+ * Carry-less multiplication of the 128-bit ma by the low 64 bits of mb.
+ * The 192-bit product is left in t1 (upper 64 bits, held in its low lane)
+ * and t0 (lower 128 bits); t2 is clobbered.
+ */
+#define MULDXS(ma, mb) \
+	t0 = _mm_clmulepi64_si128(ma, mb, 0x00);\
+	t2 = _mm_clmulepi64_si128(ma, mb, 0x01);\
+	t1 = SHR8(t2, 8);\
+	t2 = SHL8(t2, 8);\
+	t0 = XOR(t0, t2);
+
+/** Same as MULDXS() with the operands swapped. */
+#define MULSXD(ma, mb) \
+	MULDXS(mb, ma)
+
+/**
+ * Folds the 128-bit word t into the two words m1:m0 by XOR-ing in t shifted
+ * left by 5, 7, 9 and 12 bits; bits shifted out of a 64-bit lane are carried
+ * into the next lane (and from m0 into m1) by the matching right shifts by
+ * 59, 57, 55 and 52. With t located 256 bits above m0, these shift amounts
+ * correspond to reduction modulo x^251 + x^7 + x^4 + x^2 + 1. Clobbers t0
+ * and t1.
+ */
+#define RED251(t,m1,m0)\
+	t0 = _mm_slli_si128(t,8);\
+	t1 = _mm_srli_si128(t,8);\
+	m1 = _mm_xor_si128(m1,_mm_srli_epi64(t1,59));\
+	m1 = _mm_xor_si128(m1,_mm_srli_epi64(t1,57));\
+	m1 = _mm_xor_si128(m1,_mm_srli_epi64(t1,55));\
+	m1 = _mm_xor_si128(m1,_mm_srli_epi64(t1,52));\
+	m0 = _mm_xor_si128(m0,_mm_srli_epi64(t0,59));\
+	m0 = _mm_xor_si128(m0,_mm_srli_epi64(t0,57));\
+	m0 = _mm_xor_si128(m0,_mm_srli_epi64(t0,55));\
+	m0 = _mm_xor_si128(m0,_mm_srli_epi64(t0,52));\
+	t0 = _mm_srli_si128(t0,8);\
+	t1 = _mm_slli_si128(t1,8);\
+	m0 = _mm_xor_si128(m0,_mm_slli_epi64(t0,5));\
+	m0 = _mm_xor_si128(m0,_mm_slli_epi64(t0,7));\
+	m0 = _mm_xor_si128(m0,_mm_slli_epi64(t0,9));\
+	m0 = _mm_xor_si128(m0,_mm_slli_epi64(t0,12));\
+	m0 = _mm_xor_si128(m0,_mm_slli_epi64(t1,5));\
+	m0 = _mm_xor_si128(m0,_mm_slli_epi64(t1,7));\
+	m0 = _mm_xor_si128(m0,_mm_slli_epi64(t1,9));\
+	m0 = _mm_xor_si128(m0,_mm_slli_epi64(t1,12));
+
+/**
+ * Reduces the 512-bit value m3:m2:m1:m0 into m1:m0. m3 and then m2 are
+ * folded down with RED251(); afterwards the top five bits of m1 (bit 59 and
+ * above of its high lane) are folded into the low lane of m0 at bit offsets
+ * 0, 2, 4 and 7. Those five bits are NOT cleared in m1; the caller has to
+ * mask them. Clobbers m8, m9, t0 and t1.
+ */
+#define REDUCE() \
+	RED251(m3,m2,m1); \
+	RED251(m2,m1,m0); \
+	m8 = _mm_srli_si128(m1,8); \
+	m9 = _mm_srli_epi64(m8,59); \
+	m9 = _mm_slli_epi64(m9,59); \
+	m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,59)); \
+	m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,57)); \
+	m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,55)); \
+	m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,52));
+
diff --git a/src/low/curve2251-sse/relic_bn_div_low.c b/src/low/curve2251-sse/relic_bn_div_low.c
new file mode 100644
index 000000000..44aac9c2a
--- /dev/null
+++ b/src/low/curve2251-sse/relic_bn_div_low.c
@@ -0,0 +1,47 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level multiple precision division functions.
+ *
+ * @version $Id: relic_bn_div_low.c 677 2011-03-05 22:19:43Z dfaranha $
+ * @ingroup bn
+ */
+
+#include <gmp.h>
+
+#include "relic_bn.h"
+#include "relic_bn_low.h"
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Divides a multiple precision integer by another, delegating to GMP's
+ * mpn_tdiv_qr().
+ *
+ * @param[out] c		- the quotient digits.
+ * @param[out] d		- the remainder digits.
+ * @param[in] a			- the dividend.
+ * @param[in] sa		- the number of digits in the dividend.
+ * @param[in] b			- the divisor.
+ * @param[in] sb		- the number of digits in the divisor.
+ */
+void bn_divn_low(dig_t *c, dig_t *d, dig_t *a, int sa, dig_t *b, int sb) {
+	mpn_tdiv_qr(c, d, 0, a, sa, b, sb);
+}
+
+/**
+ * Divides a multiple precision integer by a single digit, delegating to
+ * GMP's mpn_divrem_1().
+ *
+ * @param[out] c		- the quotient digits.
+ * @param[out] d		- receives the single-digit remainder.
+ * @param[in] a			- the dividend.
+ * @param[in] size		- the number of digits in the dividend.
+ * @param[in] b			- the divisor digit.
+ */
+void bn_div1_low(dig_t *c, dig_t *d, dig_t *a, int size, dig_t b) {
+	*d = mpn_divrem_1(c, 0, a, size, b);
+}
diff --git a/src/low/curve2251-sse/relic_bn_mod_low.c b/src/low/curve2251-sse/relic_bn_mod_low.c
new file mode 100644
index 000000000..5b4b4025b
--- /dev/null
+++ b/src/low/curve2251-sse/relic_bn_mod_low.c
@@ -0,0 +1,62 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level multiple precision integer modular reduction
+ * functions.
+ *
+ * @version $Id: relic_bn_mod_low.c 677 2011-03-05 22:19:43Z dfaranha $
+ * @ingroup bn
+ */
+
+#include <gmp.h>
+#include <string.h> /* NOTE(review): header name was lost in extraction; confirm. */
+
+#include "relic_bn.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Reduces a multiple precision integer digit by digit: for each of the sm
+ * low digits, a multiple of the modulus chosen from u is added so that the
+ * digit cancels, and the result is finally shifted right by sm digits.
+ *
+ * NOTE(review): this is the shape of a Montgomery reduction, so u is
+ * presumably -m^(-1) modulo the digit base; confirm against the caller.
+ * The buffer c is accessed up to digit 2 * sm; digits of c at index sa and
+ * above are not initialized here.
+ *
+ * @param[out] c		- the result and working buffer.
+ * @param[in] a			- the integer to reduce.
+ * @param[in] sa		- the number of digits copied from a.
+ * @param[in] m			- the modulus.
+ * @param[in] sm		- the number of digits in the modulus.
+ * @param[in] u			- the per-digit reduction constant.
+ */
+void bn_modn_low(dig_t *c, dig_t *a, int sa, dig_t *m, int sm, dig_t u) {
+	int i;
+	dig_t r, carry, *tmpc;
+
+	tmpc = c;
+
+	/* Work on a copy of the input held in the output buffer. */
+	for (i = 0; i < sa; i++, tmpc++, a++) {
+		*tmpc = *a;
+	}
+
+	tmpc = c;
+
+	for (i = 0; i < sm; i++, tmpc++) {
+		r = (dig_t)(*tmpc * u);
+		carry = mpn_addmul_1(tmpc, m, sm, r);
+		/* Propagate the carry through the remaining upper digits. */
+		mpn_add_1(tmpc + sm, tmpc + sm, sm - i + 1, carry);
+	}
+	bn_rshd_low(c, c, 2 * sm + 1, sm);
+}
diff --git a/src/low/curve2251-sse/relic_bn_mul_low.c b/src/low/curve2251-sse/relic_bn_mul_low.c
new file mode 100644
index 000000000..502ad438b
--- /dev/null
+++ b/src/low/curve2251-sse/relic_bn_mul_low.c
@@ -0,0 +1,60 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the multiple precision integer arithmetic multiplication
+ * functions.
+ *
+ * @version $Id: relic_bn_mul_low.c 976 2012-01-07 02:21:45Z dfaranha $
+ * @ingroup bn
+ */
+
+#include <gmp.h>
+
+#include "relic_bn.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Multiplies a digit vector by a digit and adds the product to c in place
+ * (GMP mpn_addmul_1()).
+ *
+ * @param[in,out] c		- the accumulator, size digits.
+ * @param[in] a			- the digit vector to multiply.
+ * @param[in] digit		- the digit to multiply by.
+ * @param[in] size		- the number of digits in a.
+ * @return the carry out of the most significant digit.
+ */
+dig_t bn_muladd_low(dig_t *c, dig_t *a, dig_t digit, int size) {
+	return mpn_addmul_1(c, a, size, digit);
+}
+
+/**
+ * Multiplies a digit vector by a digit (GMP mpn_mul_1()).
+ *
+ * @param[out] c		- the low size digits of the product.
+ * @param[in] a			- the digit vector to multiply.
+ * @param[in] digit		- the digit to multiply by.
+ * @param[in] size		- the number of digits in a.
+ * @return the most significant digit of the product.
+ */
+dig_t bn_mul1_low(dig_t *c, dig_t *a, dig_t digit, int size) {
+	return mpn_mul_1(c, a, size, digit);
+}
+
+/**
+ * Multiplies two digit vectors of the same size (GMP mpn_mul_n()).
+ *
+ * @param[out] c		- the product, 2 * size digits.
+ * @param[in] a			- the first operand.
+ * @param[in] b			- the second operand.
+ * @param[in] size		- the number of digits in each operand.
+ */
+void bn_muln_low(dig_t *c, dig_t *a, dig_t *b, int size) {
+	mpn_mul_n(c, a, b, size);
+}
+
+/**
+ * Multiplies two digit vectors of different sizes (GMP mpn_mul()). The low
+ * and high bounds are ignored by this backend: the full product is always
+ * computed. mpn_mul() requires sizea >= sizeb.
+ *
+ * @param[out] c		- the product, sizea + sizeb digits.
+ * @param[in] a			- the first operand.
+ * @param[in] sizea		- the number of digits in a.
+ * @param[in] b			- the second operand.
+ * @param[in] sizeb		- the number of digits in b.
+ * @param[in] low		- unused.
+ * @param[in] high		- unused.
+ */
+void bn_muld_low(dig_t *c, dig_t *a, int sizea, dig_t *b, int sizeb,
+		int low, int high) {
+	(void) low;
+	(void) high;
+	mpn_mul(c, a, sizea, b, sizeb);
+}
diff --git a/src/low/curve2251-sse/relic_bn_sqr_low.c b/src/low/curve2251-sse/relic_bn_sqr_low.c
new file mode 100644
index 000000000..c35bddee8
--- /dev/null
+++ b/src/low/curve2251-sse/relic_bn_sqr_low.c
@@ -0,0 +1,59 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the multiple precision integer arithmetic squaring
+ * functions.
+ *
+ * @version $Id: relic_bn_sqr_low.c 677 2011-03-05 22:19:43Z dfaranha $
+ * @ingroup bn
+ */
+
+#include <gmp.h>
+
+#include "relic_bn.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Accumulates into c the partial products of a squaring that involve the
+ * first digit of a: a[0] * a[0..size-1] is added at c, and then
+ * a[0] * a[1..size-1] is added once more at c + 1, so that every cross term
+ * is counted twice. Each carry is propagated through size digits starting
+ * at c + size.
+ *
+ * @param[in,out] c		- the accumulator, accessed up to digit 2 * size - 1.
+ * @param[in] a			- the digit vector being squared.
+ * @param[in] size		- the number of digits in a.
+ */
+void bn_sqradd_low(dig_t *c, dig_t *a, int size) {
+	dig_t carry;
+	dig_t digit;
+
+	digit = *a;
+
+	carry = mpn_addmul_1(c, a, size, digit);
+	mpn_add_1(c+size, c+size, size, carry);
+	if (size - 1 > 0) {
+		carry = mpn_addmul_1(c+1, a+1, size-1, digit);
+		mpn_add_1(c+size, c+size, size, carry);
+	}
+}
+
+/**
+ * Squares a digit vector by multiplying it with itself (GMP mpn_mul_n()).
+ *
+ * @param[out] c		- the square, 2 * size digits.
+ * @param[in] a			- the digit vector to square.
+ * @param[in] size		- the number of digits in a.
+ */
+void bn_sqrn_low(dig_t *c, dig_t *a, int size) {
+	mpn_mul_n(c, a, a, size);
+}
diff --git a/src/low/curve2251-sse/relic_fb_add_low.c b/src/low/curve2251-sse/relic_fb_add_low.c
new file mode 100755
index 000000000..941a3f5df
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_add_low.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field addition and subtraction
+ * functions.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#ifdef __PCLMUL__
+#include <wmmintrin.h>
+#endif
+
+#include <stdlib.h> /* NOTE(review): header name was lost in extraction; confirm. */
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+
+#include "macros.h"
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Adds a single digit to a field element: the lowest digit of a is XOR-ed
+ * with the digit and the remaining FB_DIGS - 1 digits are copied unchanged.
+ *
+ * @param[out] c		- the result, FB_DIGS digits.
+ * @param[in] a			- the field element.
+ * @param[in] digit		- the digit to add.
+ */
+void fb_add1_low(dig_t *c, dig_t *a, dig_t digit) {
+	int i;
+
+	(*c) = (*a) ^ digit;
+	c++;
+	a++;
+	for (i = 0; i < FB_DIGS - 1; i++, a++, c++) {
+		(*c) = (*a);
+	}
+}
+
+/**
+ * Adds two field elements with two 128-bit XORs, i.e. exactly four digits
+ * are processed. The operands are dereferenced as __m128i, so all three
+ * pointers have to be 16-byte aligned.
+ *
+ * @param[out] c		- the result.
+ * @param[in] a			- the first field element.
+ * @param[in] b			- the second field element.
+ */
+void fb_addn_low(dig_t *c, dig_t *a, dig_t *b) {
+	*(__m128i *)c = XOR(*(__m128i*)(a), *(__m128i*)(b));
+	*(__m128i *)(c + 2) = XOR(*(__m128i*)(a + 2), *(__m128i*)(b + 2));
+}
+
+/**
+ * Adds two digit vectors of the given size. When size is 2 * FB_DIGS the
+ * sum is computed with four 128-bit XORs (eight digits), which requires
+ * 16-byte aligned pointers; any other size is handled digit by digit.
+ *
+ * @param[out] c		- the result.
+ * @param[in] a			- the first digit vector.
+ * @param[in] b			- the second digit vector.
+ * @param[in] size		- the number of digits to add.
+ */
+void fb_addd_low(dig_t *c, dig_t *a, dig_t *b, int size) {
+	if (size == 2 * FB_DIGS) {
+		*(__m128i *)c = XOR(*(__m128i*)(a), *(__m128i*)(b));
+		*(__m128i *)(c + 2) = XOR(*(__m128i*)(a + 2), *(__m128i*)(b + 2));
+		*(__m128i *)(c + 4) = XOR(*(__m128i*)(a + 4), *(__m128i*)(b + 4));
+		*(__m128i *)(c + 6) = XOR(*(__m128i*)(a + 6), *(__m128i*)(b + 6));
+	} else {
+		for (int i = 0; i < size; i++, a++, b++, c++) {
+			(*c) = (*a) ^ (*b);
+		}
+	}
+}
diff --git a/src/low/curve2251-sse/relic_fb_inv_low.c b/src/low/curve2251-sse/relic_fb_inv_low.c
new file mode 100755
index 000000000..70fd73aed
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_inv_low.c
@@ -0,0 +1,100 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level inversion functions.
+ *
+ * @version $Id: relic_fb_inv_low.c 553 2010-11-01 22:59:38Z dfaranha $
+ * @ingroup fb
+ */
+
+#include <stdlib.h> /* NOTE(review): header name was lost in extraction; confirm. */
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Inverts a field element by exponentiation along the addition chain
+ * returned by fb_poly_get_chain(). Entry i of the table holds
+ * a^(2^u[i] - 1). The first six entries (u = 1, 2, 3, 5, 10, 15) are built
+ * with explicit squarings; the remaining ones combine two earlier entries
+ * x and y, taken from the high and low byte of the chain element, using
+ * fb_itr() with the precomputed table for y. The result is the square of
+ * the last entry.
+ *
+ * NOTE(review): the hard-coded prefix writes entries 0..5, so the chain
+ * length must be at least 5; this is not checked here.
+ *
+ * @param[out] c		- the result.
+ * @param[in] a			- the field element to invert.
+ */
+void fb_invn_low(dig_t *c, dig_t *a) {
+	int i, j, x, y;
+	int *chain, len;
+
+	chain = fb_poly_get_chain(&len);
+
+	int u[len + 1];
+	fb_t table[len + 1];
+	for (i = 0; i <= len; i++) {
+		fb_null(table[i]);
+	}
+
+	for (i = 0; i <= len; i++) {
+		fb_new(table[i]);
+	}
+
+	u[0] = 1;
+	u[1] = 2;
+	fb_copy(table[0], a);
+	fb_sqr(table[1], table[0]);
+	fb_mul(table[1], table[1], table[0]);
+
+	u[2] = u[1] + u[0];
+	fb_sqr(table[2], table[1]);
+	fb_mul(table[2], table[2], table[0]);
+
+	u[3] = u[2] + u[1];
+	fb_sqr(table[3], table[2]);
+	for (j = 1; j < u[1]; j++) {
+		fb_sqr(table[3], table[3]);
+	}
+	fb_mul(table[3], table[3], table[1]);
+
+	u[4] = 2 * u[3];
+	fb_sqr(table[4], table[3]);
+	for (j = 1; j < u[3]; j++) {
+		fb_sqr(table[4], table[4]);
+	}
+	fb_mul(table[4], table[4], table[3]);
+
+	u[5] = u[4] + u[3];
+	fb_sqr(table[5], table[4]);
+	for (j = 1; j < u[3]; j++) {
+		fb_sqr(table[5], table[5]);
+	}
+	fb_mul(table[5], table[5], table[3]);
+
+	for (i = 6; i <= len; i++) {
+		x = chain[i - 1] >> 8;
+		y = chain[i - 1] - (x << 8);
+		if (x == y) {
+			u[i] = 2 * u[i - 1];
+		} else {
+			u[i] = u[x] + u[y];
+		}
+		dig_t *tab = (dig_t *)fb_poly_tab_sqr(y);
+		fb_itr(table[i], table[x], u[y], (void *)tab);
+		fb_mul(table[i], table[i], table[y]);
+	}
+	fb_sqr(c, table[len]);
+
+	/* Release the temporaries obtained with fb_new() above. */
+	for (i = 0; i <= len; i++) {
+		fb_free(table[i]);
+	}
+}
diff --git a/src/low/curve2251-sse/relic_fb_itr_low.c b/src/low/curve2251-sse/relic_fb_itr_low.c
new file mode 100644
index 000000000..33737d6e0
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_itr_low.c
@@ -0,0 +1,69 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level iterated squaring/square-root.
+ *
+ * @version $Id: relic_fb_sqr_low.c 677 2011-03-05 22:19:43Z dfaranha $
+ * @ingroup fb
+ */
+
+#include <stdlib.h> /* NOTE(review): header name was lost in extraction; confirm. */
+#include <emmintrin.h>
+
+#include "relic_fb.h"
+#include "relic_dv.h"
+#include "relic_fb_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Evaluates a precomputed table on a field element: the input is split into
+ * 4-bit nibbles, each nibble selects one FB_DIGS-digit table entry for its
+ * position, and all selected entries are XOR-ed together.
+ *
+ * The table entry for the nibble at bit offset i of digit j and value u
+ * starts at t + ((j * FB_DIGIT + i) * 4 + u) * FB_DIGS. Entries are read,
+ * and the result is written, as two 128-bit words, i.e. four digits; the
+ * table entries and c therefore have to be 16-byte aligned. The topmost
+ * nibble of the last digit is not processed.
+ *
+ * @param[out] c		- the result.
+ * @param[in] a			- the field element.
+ * @param[in] t			- the precomputed table.
+ */
+void fb_itrn_low(dig_t *c, dig_t *a, dig_t *t) {
+	int i, j;
+	dig_t u, *tmp, *p;
+
+	__m128i r0, r1;
+	r0 = r1 = _mm_setzero_si128();
+	/* All nibbles of the lower FB_DIGS - 1 digits. */
+	for (i = FB_DIGIT - 4; i >= 0; i -= 4) {
+		tmp = a;
+		for (j = 0; j < FB_DIGS - 1; j++, tmp++) {
+			u = (*tmp >> i) & 0x0F;
+			p = (t + ((j * FB_DIGIT + i) * 4 + u) * FB_DIGS);
+			r0 = _mm_xor_si128(r0, *(__m128i *)(p));
+			r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
+		}
+	}
+	/*
+	 * Last digit, skipping its top nibble. The digit index is set here
+	 * explicitly instead of relying on the value left by the loop above.
+	 */
+	j = FB_DIGS - 1;
+	tmp = a + j;
+	for (i = FB_DIGIT - 8; i >= 0; i -= 4) {
+		u = (*tmp >> i) & 0x0F;
+		p = (t + ((j * FB_DIGIT + i) * 4 + u) * FB_DIGS);
+		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
+		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
+	}
+
+	_mm_store_si128((__m128i *)c, r0);
+	_mm_store_si128((__m128i *)(c + 2), r1);
+}
diff --git a/src/low/curve2251-sse/relic_fb_mul_low.c b/src/low/curve2251-sse/relic_fb_mul_low.c
new file mode 100755
index 000000000..f91dfdba7
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_mul_low.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field bit multiplication functions.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include <stdlib.h> /* NOTE(review): header name was lost in extraction; confirm. */
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+#include "macros.h"
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Multiplies two binary polynomials of the same size with a 4-bit window.
+ * A table of the 16 products u * b (u = 0..15, size + 1 digits each) is
+ * built first. The nibbles of a are then consumed from the most to the
+ * least significant nibble position: for every digit of a the selected
+ * table row is added at that digit's offset in c, and c is shifted left by
+ * four bits between nibble positions.
+ *
+ * @param[out] c		- the product, 2 * size digits.
+ * @param[in] a			- the first polynomial.
+ * @param[in] b			- the second polynomial.
+ * @param[in] size		- the number of digits in each operand.
+ */
+void fb_muld_low(dig_t *c, dig_t *a, dig_t *b, int size) {
+	dv_t table[16];
+	dig_t u, *tmpa, *tmpc, r0, r1, r2, r4, r8;
+	int i, j;
+
+	dv_zero(c, 2 * size);
+
+	for (i = 0; i < 16; i++) {
+		dv_new(table[i]);
+		dv_zero(table[i], size + 1);
+	}
+
+	/* u carries the previous digit of b, whose top bits spill into r2/r4/r8. */
+	u = 0;
+	for (i = 0; i < size; i++) {
+		r1 = r0 = b[i];
+		r2 = (r0 << 1) | (u >> (FB_DIGIT - 1));
+		r4 = (r0 << 2) | (u >> (FB_DIGIT - 2));
+		r8 = (r0 << 3) | (u >> (FB_DIGIT - 3));
+		table[0][i] = 0;
+		table[1][i] = r1;
+		table[2][i] = r2;
+		table[3][i] = r1 ^ r2;
+		table[4][i] = r4;
+		table[5][i] = r1 ^ r4;
+		table[6][i] = r2 ^ r4;
+		table[7][i] = r1 ^ r2 ^ r4;
+		table[8][i] = r8;
+		table[9][i] = r1 ^ r8;
+		table[10][i] = r2 ^ r8;
+		table[11][i] = r1 ^ r2 ^ r8;
+		table[12][i] = r4 ^ r8;
+		table[13][i] = r1 ^ r4 ^ r8;
+		table[14][i] = r2 ^ r4 ^ r8;
+		table[15][i] = r1 ^ r2 ^ r4 ^ r8;
+		u = r1;
+	}
+
+	/* Extra top digit of every row: the bits shifted out of b[size - 1]. */
+	if (u > 0) {
+		r2 = u >> (FB_DIGIT - 1);
+		r4 = u >> (FB_DIGIT - 2);
+		r8 = u >> (FB_DIGIT - 3);
+		table[0][size] = table[1][size] = 0;
+		table[2][size] = table[3][size] = r2;
+		table[4][size] = table[5][size] = r4;
+		table[6][size] = table[7][size] = r2 ^ r4;
+		table[8][size] = table[9][size] = r8;
+		table[10][size] = table[11][size] = r2 ^ r8;
+		table[12][size] = table[13][size] = r4 ^ r8;
+		table[14][size] = table[15][size] = r2 ^ r4 ^ r8;
+	}
+
+	for (i = FB_DIGIT - 4; i > 0; i -= 4) {
+		tmpa = a;
+		tmpc = c;
+		for (j = 0; j < size; j++, tmpa++, tmpc++) {
+			u = (*tmpa >> i) & 0x0F;
+			fb_addd_low(tmpc, tmpc, table[u], size + 1);
+		}
+		bn_lshb_low(c, c, 2 * size, 4);
+	}
+	/* Lowest nibble position: no shift afterwards. */
+	for (j = 0; j < size; j++, a++, c++) {
+		u = *a & 0x0F;
+		fb_addd_low(c, c, table[u], size + 1);
+	}
+	for (i = 0; i < 16; i++) {
+		dv_free(table[i]);
+	}
+}
+
+/*
+ * The fixed-size multipliers are taken from one of three implementation
+ * files: carry-less multiplication when PCLMUL is available (or with the
+ * Intel compiler), otherwise the "sf" variant when SHUFFLE is defined and
+ * the "ld" variant when it is not.
+ */
+#if defined(__PCLMUL__) || defined(__INTEL_COMPILER)
+#include "relic_fb_mul_low_cl.c"
+#else
+#ifndef SHUFFLE
+#include "relic_fb_mul_low_ld.c"
+#else
+#include "relic_fb_mul_low_sf.c"
+#endif
+#endif
diff --git a/src/low/curve2251-sse/relic_fb_mul_low_cl.c b/src/low/curve2251-sse/relic_fb_mul_low_cl.c
new file mode 100755
index 000000000..69c802233
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_mul_low_cl.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field bit multiplication functions.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include <stdlib.h> /* NOTE(review): header name was lost in extraction; confirm. */
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+#include "macros.h"
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <wmmintrin.h>
+
+/*============================================================================*/
+/* Public definitions                                                         */
+/*============================================================================*/
+
+/**
+ * Multiplies two 256-bit binary polynomials without reduction. Each operand
+ * is split into two 128-bit halves and the product is assembled from three
+ * MUL() invocations (low halves, high halves, XOR-ed halves). Operands and
+ * result are accessed with aligned 128-bit loads and stores, so all three
+ * pointers have to be 16-byte aligned.
+ *
+ * @param[out] c		- the product, eight digits.
+ * @param[in] a			- the first operand, four digits.
+ * @param[in] b			- the second operand, four digits.
+ */
+void fb_muln_low(dig_t *c, dig_t *a, dig_t *b) {
+	__m128i ma0, ma1, mb0, mb1, m0, m1, m2, m3, m4, m5, t0, t1, t2, t3;
+
+	ma0 = _mm_load_si128((__m128i *)a);
+	mb0 = _mm_load_si128((__m128i *)b);
+
+	MUL(ma0, mb0);
+	m0 = t0;
+	m1 = t1;
+
+	ma1 = _mm_load_si128((__m128i *)a + 1);
+	mb1 = _mm_load_si128((__m128i *)b + 1);
+	MUL(ma1, mb1);
+	m2 = t0;
+	m3 = t1;
+
+	ma0 = XOR(ma0, ma1);
+	mb0 = XOR(mb0, mb1);
+
+	MUL(ma0, mb0);
+	m4 = t0;
+	m5 = t1;
+
+	/* Middle term: subtract (XOR) the low and high products. */
+	m4 = _mm_xor_si128(m4, m0);
+	m5 = _mm_xor_si128(m5, m1);
+	m4 = _mm_xor_si128(m4, m2);
+	m5 = _mm_xor_si128(m5, m3);
+
+	m1 = XOR(m1, m4);
+	m2 = XOR(m2, m5);
+
+	_mm_store_si128((__m128i *) c + 0, m0);
+	_mm_store_si128((__m128i *) c + 1, m1);
+	_mm_store_si128((__m128i *) c + 2, m2);
+	_mm_store_si128((__m128i *) c + 3, m3);
+
+}
+
+#if !defined(__INTEL_COMPILER)
+
+/**
+ * Multiplies two field elements and reduces the result. The 512-bit product
+ * is computed as in fb_muln_low() into a local buffer and then reduced on
+ * 64-bit digits: digits 7 down to 4 are folded four digits lower with left
+ * shifts by 5, 7, 9 and 12 (overflow carried by right shifts by 59, 57, 55
+ * and 52), and finally the bits from bit 59 upwards of digit 3 are folded
+ * into digit 0 and cleared.
+ *
+ * @param[out] c		- the reduced product, four digits.
+ * @param[in] a			- the first field element (16-byte aligned).
+ * @param[in] b			- the second field element (16-byte aligned).
+ */
+void fb_mulm_low(dig_t *c, dig_t *a, dig_t *b) {
+	__m128i ma0, ma1, mb0, mb1, m0, m1, m2, m3, m4, m5, t0, t1, t2, t3;
+	align dig_t t[2*FB_DIGS];
+
+	ma0 = _mm_load_si128((__m128i *)a);
+	mb0 = _mm_load_si128((__m128i *)b);
+
+	MUL(ma0, mb0);
+	m0 = t0;
+	m1 = t1;
+
+	ma1 = _mm_load_si128((__m128i *)a + 1);
+	mb1 = _mm_load_si128((__m128i *)b + 1);
+	MUL(ma1, mb1);
+	m2 = t0;
+	m3 = t1;
+
+	ma0 = XOR(ma0, ma1);
+	mb0 = XOR(mb0, mb1);
+
+	MUL(ma0, mb0);
+	m4 = t0;
+	m5 = t1;
+
+	m4 = _mm_xor_si128(m4, m0);
+	m5 = _mm_xor_si128(m5, m1);
+	m4 = _mm_xor_si128(m4, m2);
+	m5 = _mm_xor_si128(m5, m3);
+
+	m1 = XOR(m1, m4);
+	m2 = XOR(m2, m5);
+
+	_mm_store_si128((__m128i *) t + 0, m0);
+	_mm_store_si128((__m128i *) t + 1, m1);
+	_mm_store_si128((__m128i *) t + 2, m2);
+	_mm_store_si128((__m128i *) t + 3, m3);
+
+	/* Right (r*) and left (l*) shift amounts; each pair sums to 64. */
+	const int ra = 52;
+	const int rb = 55;
+	const int rc = 57;
+	const int rh = 59;
+	const int lh = 5;
+	const int la = 12;
+	const int lb = 9;
+	const int lc = 7;
+
+	dig_t d = t[7], a0 = t[0], a1 = t[1], a2 = t[2], a3 = t[3], a4 = t[4];
+
+	a4 ^= (d >> rh);
+	a4 ^= (d >> ra);
+	a4 ^= (d >> rb);
+	a4 ^= (d >> rc);
+
+	a3 ^= (d << lh);
+	a3 ^= (d << la);
+	a3 ^= (d << lb);
+	a3 ^= (d << lc);
+
+	d = t[6];
+	a3 ^= (d >> rh);
+	a3 ^= (d >> ra);
+	a3 ^= (d >> rb);
+	a3 ^= (d >> rc);
+
+	a2 ^= (d << lh);
+	a2 ^= (d << la);
+	a2 ^= (d << lb);
+	a2 ^= (d << lc);
+
+	d = t[5];
+	a2 ^= (d >> rh);
+	a2 ^= (d >> ra);
+	a2 ^= (d >> rb);
+	a2 ^= (d >> rc);
+
+	a1 ^= (d << lh);
+	a1 ^= (d << la);
+	a1 ^= (d << lb);
+	a1 ^= (d << lc);
+
+	/* Digit 4 has already absorbed the overflow from digit 7. */
+	d = a4;
+	a1 ^= (d >> rh);
+	a1 ^= (d >> ra);
+	a1 ^= (d >> rb);
+	a1 ^= (d >> rc);
+
+	a0 ^= (d << lh);
+	a0 ^= (d << la);
+	a0 ^= (d << lb);
+	a0 ^= (d << lc);
+
+	/* Fold the bits from bit 59 upwards of digit 3, then clear them. */
+	d = a3 >> rh;
+	a0 ^= d;
+	d <<= rh;
+
+	a0 ^= (d >> ra);
+	a0 ^= (d >> rb);
+	a0 ^= (d >> rc);
+	a3 ^= d;
+
+	c[3] = a3;
+	c[2] = a2;
+	c[1] = a1;
+	c[0] = a0;
+
+	return;
+}
+
+#else
+
+/**
+ * Multiplies two field elements and reduces the result, keeping the whole
+ * computation in SSE registers: the product is formed as in fb_muln_low()
+ * and reduced with REDUCE(). The upper five bits of the last digit, which
+ * REDUCE() leaves in place, are masked off when the result is written.
+ *
+ * @param[out] c		- the reduced product, four digits (16-byte aligned).
+ * @param[in] a			- the first field element (16-byte aligned).
+ * @param[in] b			- the second field element (16-byte aligned).
+ */
+void fb_mulm_low(dig_t *c, dig_t *a, dig_t *b) {
+	__m128i ma0, ma1, mb0, mb1, m0, m1, m2, m3, m4, m5, m8, m9, t0, t1, t2, t3;
+
+	ma0 = _mm_load_si128((__m128i *)a);
+	mb0 = _mm_load_si128((__m128i *)b);
+
+	MUL(ma0, mb0);
+	m0 = t0;
+	m1 = t1;
+
+	ma1 = _mm_load_si128((__m128i *)a + 1);
+	mb1 = _mm_load_si128((__m128i *)b + 1);
+	MUL(ma1, mb1);
+	m2 = t0;
+	m3 = t1;
+
+	ma0 = XOR(ma0, ma1);
+	mb0 = XOR(mb0, mb1);
+
+	MUL(ma0, mb0);
+	m4 = t0;
+	m5 = t1;
+
+	m4 = _mm_xor_si128(m4, m0);
+	m5 = _mm_xor_si128(m5, m1);
+	m4 = _mm_xor_si128(m4, m2);
+	m5 = _mm_xor_si128(m5, m3);
+
+	m1 = XOR(m1, m4);
+	m2 = XOR(m2, m5);
+
+	align dig_t _x[2];
+
+	REDUCE();
+	_mm_store_si128((__m128i *) c + 0, m0);
+	_mm_store_si128((__m128i *) _x, m1);
+	c[2] = _x[0];
+	c[3] = _x[1] & 0x07FFFFFFFFFFFFFF;
+	return;
+}
+
+#endif
+
+/**
+ * Multiplies a field element by a single digit using two MULDXS()
+ * invocations, one per 128-bit half of a.
+ *
+ * NOTE(review): three 128-bit words, i.e. six digits, are stored to c; the
+ * sixth digit is always zero. Confirm that every caller provides room for
+ * six digits rather than FB_DIGS + 1.
+ *
+ * @param[out] c		- the product (16-byte aligned).
+ * @param[in] a			- the field element (16-byte aligned).
+ * @param[in] digit		- the digit to multiply by.
+ */
+void fb_mul1_low(dig_t *c, dig_t *a, dig_t digit) {
+	__m128i ma, mb, m0, m1, m2, t0, t1, t2;
+
+	ma = _mm_load_si128((__m128i *)a);
+	/* Place the digit in the low 64 bits of mb, upper 64 bits zero. */
+	mb = _mm_set_epi32(0, 0, digit >> 32, digit & 0xFFFFFFFF);
+
+	MULDXS(ma, mb);
+	m0 = t0;
+	m1 = t1;
+
+	ma = _mm_load_si128((__m128i *)a + 1);
+	MULDXS(ma, mb);
+	m1 = XOR(m1, t0);
+	m2 = t1;
+
+	_mm_store_si128((__m128i *) c + 0, m0);
+	_mm_store_si128((__m128i *) c + 1, m1);
+	_mm_store_si128((__m128i *) c + 2, m2);
+}
diff --git a/src/low/curve2251-sse/relic_fb_mul_low_ld.c b/src/low/curve2251-sse/relic_fb_mul_low_ld.c
new file mode 100755
index 000000000..15420960f
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_mul_low_ld.c
@@ -0,0 +1,483 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field bit multiplication functions.
+ * + * @version $Id$ + * @ingroup fb + */ + +#include + +#include "relic_fb.h" +#include "relic_fb_low.h" +#include "relic_bn_low.h" +#include "relic_util.h" +#include "macros.h" + +#include +#include + +/*============================================================================*/ +/* Public definitions */ +/*============================================================================*/ + +void fb_mul1_low(dig_t *c, dig_t *a, dig_t digit) { + int j, k; + dig_t b1, b2; + + if (digit == 0) { + dv_zero(c, FB_DIGS + 1); + return; + } + if (digit == 1) { + fb_copy(c, a); + return; + } + c[FB_DIGS] = fb_lshb_low(c, a, util_bits_dig(digit) - 1); + for (int i = util_bits_dig(digit) - 2; i > 0; i--) { + if (digit & ((dig_t)1 << i)) { + j = FB_DIGIT - i; + b1 = a[0]; + c[0] ^= (b1 << i); + for (k = 1; k < FB_DIGS; k++) { + b2 = a[k]; + c[k] ^= ((b2 << i) | (b1 >> j)); + b1 = b2; + } + c[FB_DIGS] ^= (b1 >> j); + } + } + if (digit & (dig_t)1) { + fb_add(c, c, a); + } +} + +void fb_muln_low(dig_t *c, dig_t *a, dig_t *b) { + __m128i tab[16][8], tab1[16][8]; + __m128i s0, m0, m1, m2, m3, m4, m8, m9; + char ta, tb; + int i, j, k; + +#define LOOKUP(i, T)\ + T[0][i] = _mm_setzero_si128();\ + T[1][i] = m0;\ + T[2][i] = m1;\ + T[3][i] = m9=_mm_xor_si128(m0,m1);\ + T[4][i] = m2;\ + T[5][i] = _mm_xor_si128(m2,m0);\ + T[6][i] = _mm_xor_si128(m2,m1);\ + T[7][i] = _mm_xor_si128(m2,m9);\ + T[8][i] = m3;\ + T[9][i] = _mm_xor_si128(m3,m0);\ + T[10][i] = _mm_xor_si128(m3,m1);\ + T[11][i] = _mm_xor_si128(m3,m9);\ + T[12][i] = m2=_mm_xor_si128(m3,m2);\ + T[13][i] = _mm_xor_si128(m2,m0);\ + T[14][i] = _mm_xor_si128(m2,m1);\ + T[15][i] = _mm_xor_si128(m2,m9); + + s0 = _mm_setzero_si128(); + for (i = 0; i < 2; i++) { + m0 = _mm_load_si128((__m128i *) (b + 2 * i)); + m9 = _mm_srli_epi64(m0, 57); + m8 = _mm_slli_si128(m9, 8); + m8 = _mm_xor_si128(m8, s0); + s0 = _mm_srli_si128(m9, 8); + m1 = _mm_slli_epi64(m0, 1); + m2 = _mm_slli_epi64(m0, 2); + m3 = _mm_slli_epi64(m0, 3); + m3 = 
_mm_xor_si128(m3, _mm_srli_epi64(m8, 4)); + m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 5)); + m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 6)); + LOOKUP(i, tab); + m4 = m0; + m0 = _mm_slli_epi64(m4, 4); + m1 = _mm_slli_epi64(m4, 5); + m2 = _mm_slli_epi64(m4, 6); + m3 = _mm_slli_epi64(m4, 7); + m3 = _mm_xor_si128(m3, m8); + m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 1)); + m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 2)); + m0 = _mm_xor_si128(m0, _mm_srli_epi64(m8, 3)); + LOOKUP(i, tab1); + } + m3 = s0; + m2 = _mm_srli_epi64(s0, 1); + tab1[0][i] = tab1[1][i] = tab1[2][i] = tab1[3][i] = _mm_setzero_si128(); + tab1[4][i] = tab1[5][i] = tab1[6][i] = tab1[7][i] = m2; + tab1[8][i] = tab1[9][i] = tab1[10][i] = tab1[11][i] = m3; + m2 =_mm_xor_si128(m3,m2); + tab1[12][i] = tab1[13][i] = tab1[14][i] = tab1[15][i] = m2; +#undef LOOKUP + +#define LSHIFT8(m3,m2,m1,m0)\ + m3=_mm_alignr_epi8(m3,m2,15);\ + m2=_mm_alignr_epi8(m2,m1,15);\ + m1=_mm_alignr_epi8(m1,m0,15);\ + m0=_mm_slli_si128(m0,1); + +#define M(m2,m1,m0,ta,tb)\ + m0=_mm_xor_si128(m0, tab[ta&0xf][0]);\ + m1=_mm_xor_si128(m1, tab[ta&0xf][1]);\ + m0=_mm_xor_si128(m0,tab1[tb&0xf][0]);\ + m1=_mm_xor_si128(m1,tab1[tb&0xf][1]);\ + m2=_mm_xor_si128(m2,tab1[tb&0xf][2]);\ + + // Main computation + m0 = m1 = m2 = m3 = _mm_setzero_si128(); + + for (j = 56; j >= 0; j -= 8) { + k = j + 4; + ta = (a[1] >> j); + tb = (a[1] >> k); + M(m2, m1, m0, ta, tb); + ta = (a[3] >> j); + tb = (a[3] >> k); + M(m3, m2, m1, ta, tb); + LSHIFT8(m3, m2, m1, m0); + } + for (j = 56; j >= 8; j -= 8) { + k = j + 4; + ta = (a[0] >> j); + tb = (a[0] >> k); + M(m2, m1, m0, ta, tb); + ta = (a[2] >> j); + tb = (a[2] >> k); + M(m3, m2, m1, ta, tb); + LSHIFT8(m3, m2, m1, m0); + } + ta = a[0]; + tb = (a[0] >> 4); + M(m2, m1, m0, ta, tb); + ta = a[2]; + tb = (a[2] >> 4); + M(m3, m2, m1, ta, tb); + + _mm_store_si128((__m128i *) c + 0, m0); + _mm_store_si128((__m128i *) c + 1, m1); + _mm_store_si128((__m128i *) c + 2, m2); + _mm_store_si128((__m128i *) c + 3, m3); +#undef 
M +} + +#if !defined(__INTEL_COMPILER) + +void fb_mulm_low(dig_t *c, dig_t *a, dig_t *b) { + __m128i tab[16][8], tab1[16][8]; + __m128i s0, m0, m1, m2, m3, m4, m8, m9; + align dig_t t[2*FB_DIGS]; + char ta, tb; + int i, j, k; + +#define LOOKUP(i, T)\ + T[0][i] = _mm_setzero_si128();\ + T[1][i] = m0;\ + T[2][i] = m1;\ + T[3][i] = m9=_mm_xor_si128(m0,m1);\ + T[4][i] = m2;\ + T[5][i] = _mm_xor_si128(m2,m0);\ + T[6][i] = _mm_xor_si128(m2,m1);\ + T[7][i] = _mm_xor_si128(m2,m9);\ + T[8][i] = m3;\ + T[9][i] = _mm_xor_si128(m3,m0);\ + T[10][i] = _mm_xor_si128(m3,m1);\ + T[11][i] = _mm_xor_si128(m3,m9);\ + T[12][i] = m2=_mm_xor_si128(m3,m2);\ + T[13][i] = _mm_xor_si128(m2,m0);\ + T[14][i] = _mm_xor_si128(m2,m1);\ + T[15][i] = _mm_xor_si128(m2,m9); + + s0 = _mm_setzero_si128(); + for (i = 0; i < 2; i++) { + m0 = _mm_load_si128((__m128i *) (b + 2 * i)); + m9 = _mm_srli_epi64(m0, 57); + m8 = _mm_slli_si128(m9, 8); + m8 = _mm_xor_si128(m8, s0); + s0 = _mm_srli_si128(m9, 8); + m1 = _mm_slli_epi64(m0, 1); + m2 = _mm_slli_epi64(m0, 2); + m3 = _mm_slli_epi64(m0, 3); + m3 = _mm_xor_si128(m3, _mm_srli_epi64(m8, 4)); + m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 5)); + m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 6)); + LOOKUP(i, tab); + m4 = m0; + m0 = _mm_slli_epi64(m4, 4); + m1 = _mm_slli_epi64(m4, 5); + m2 = _mm_slli_epi64(m4, 6); + m3 = _mm_slli_epi64(m4, 7); + m3 = _mm_xor_si128(m3, m8); + m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 1)); + m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 2)); + m0 = _mm_xor_si128(m0, _mm_srli_epi64(m8, 3)); + LOOKUP(i, tab1); + } + m3 = s0; + m2 = _mm_srli_epi64(s0, 1); + m1 = _mm_setzero_si128();//_mm_srli_epi64(m8, 2); + m0 = _mm_setzero_si128();//_mm_srli_epi64(m8, 3); + tab1[0][i] = tab1[1][i] = tab1[2][i] = tab1[3][i] = m9 = _mm_setzero_si128(); + tab1[4][i] = tab1[5][i] = tab1[6][i] = tab1[7][i] = m2; + tab1[8][i] = tab1[9][i] = tab1[10][i] = tab1[11][i] = m3; + m2 =_mm_xor_si128(m3,m2); + tab1[12][i] = tab1[13][i] = tab1[14][i] = tab1[15][i] = m2; 
+#undef LOOKUP + +#define LSHIFT8(m3,m2,m1,m0)\ + m3=_mm_alignr_epi8(m3,m2,15);\ + m2=_mm_alignr_epi8(m2,m1,15);\ + m1=_mm_alignr_epi8(m1,m0,15);\ + m0=_mm_slli_si128(m0,1); + +#define M(m2,m1,m0,ta,tb)\ + m0=_mm_xor_si128(m0, tab[ta&0xf][0]);\ + m1=_mm_xor_si128(m1, tab[ta&0xf][1]);\ + m0=_mm_xor_si128(m0,tab1[tb&0xf][0]);\ + m1=_mm_xor_si128(m1,tab1[tb&0xf][1]);\ + m2=_mm_xor_si128(m2,tab1[tb&0xf][2]);\ + + // Main computation + m0 = m1 = m2 = m3 = _mm_setzero_si128(); + + for (j = 56; j >= 0; j -= 8) { + k = j + 4; + ta = (a[1] >> j); + tb = (a[1] >> k); + M(m2, m1, m0, ta, tb); + ta = (a[3] >> j); + tb = (a[3] >> k); + M(m3, m2, m1, ta, tb); + LSHIFT8(m3, m2, m1, m0); + } + for (j = 56; j >= 8; j -= 8) { + k = j + 4; + ta = (a[0] >> j); + tb = (a[0] >> k); + M(m2, m1, m0, ta, tb); + ta = (a[2] >> j); + tb = (a[2] >> k); + M(m3, m2, m1, ta, tb); + LSHIFT8(m3, m2, m1, m0); + } + ta = a[0]; + tb = (a[0] >> 4); + M(m2, m1, m0, ta, tb); + ta = a[2]; + tb = (a[2] >> 4); + M(m3, m2, m1, ta, tb); + + _mm_store_si128((__m128i *) t + 0, m0); + _mm_store_si128((__m128i *) t + 1, m1); + _mm_store_si128((__m128i *) t + 2, m2); + _mm_store_si128((__m128i *) t + 3, m3); + + const int ra = 52; + const int rb = 55; + const int rc = 57; + const int rh = 59; + const int lh = 5; + const int la = 12; + const int lb = 9; + const int lc = 7; + + dig_t d = t[7], a0 = t[0], a1 = t[1], a2 = t[2], a3 = t[3], a4 = t[4]; + + a4 ^= (d >> rh); + a4 ^= (d >> ra); + a4 ^= (d >> rb); + a4 ^= (d >> rc); + + a3 ^= (d << lh); + a3 ^= (d << la); + a3 ^= (d << lb); + a3 ^= (d << lc); + + d = t[6]; + a3 ^= (d >> rh); + a3 ^= (d >> ra); + a3 ^= (d >> rb); + a3 ^= (d >> rc); + + a2 ^= (d << lh); + a2 ^= (d << la); + a2 ^= (d << lb); + a2 ^= (d << lc); + + d = t[5]; + a2 ^= (d >> rh); + a2 ^= (d >> ra); + a2 ^= (d >> rb); + a2 ^= (d >> rc); + + a1 ^= (d << lh); + a1 ^= (d << la); + a1 ^= (d << lb); + a1 ^= (d << lc); + + d = a4; + a1 ^= (d >> rh); + a1 ^= (d >> ra); + a1 ^= (d >> rb); + a1 ^= (d >> rc); 
+ + a0 ^= (d << lh); + a0 ^= (d << la); + a0 ^= (d << lb); + a0 ^= (d << lc); + + d = a3 >> rh; + a0 ^= d; + d <<= rh; + + a0 ^= (d >> ra); + a0 ^= (d >> rb); + a0 ^= (d >> rc); + a3 ^= d; + + c[3] = a3; + c[2] = a2; + c[1] = a1; + c[0] = a0; + + return; +} + +#else + +void fb_mulm_low(dig_t *c, dig_t *a, dig_t *b) { + align __m128i tab[16][8], tab1[16][8]; + __m128i s0, m0, m1, m2, m3, m4, m8, m9, t0, t1, t2, *t; + align dig_t x[2]; + char ta, tb; + int i, j, k; + +#define LOOKUP(i, T)\ + T[0][i] = _mm_setzero_si128();\ + T[1][i] = m0;\ + T[2][i] = m1;\ + T[3][i] = m9=_mm_xor_si128(m0,m1);\ + T[4][i] = m2;\ + T[5][i] = _mm_xor_si128(m2,m0);\ + T[6][i] = _mm_xor_si128(m2,m1);\ + T[7][i] = _mm_xor_si128(m2,m9);\ + T[8][i] = m3;\ + T[9][i] = _mm_xor_si128(m3,m0);\ + T[10][i] = _mm_xor_si128(m3,m1);\ + T[11][i] = _mm_xor_si128(m3,m9);\ + T[12][i] = m2=_mm_xor_si128(m3,m2);\ + T[13][i] = _mm_xor_si128(m2,m0);\ + T[14][i] = _mm_xor_si128(m2,m1);\ + T[15][i] = _mm_xor_si128(m2,m9); + + s0 = _mm_setzero_si128(); + for (i = 0; i < 2; i++) { + m0 = _mm_load_si128((__m128i *) (b + 2 * i)); + m9 = _mm_srli_epi64(m0, 57); + m8 = _mm_slli_si128(m9, 8); + m8 = _mm_xor_si128(m8, s0); + s0 = _mm_srli_si128(m9, 8); + m1 = _mm_slli_epi64(m0, 1); + m2 = _mm_slli_epi64(m0, 2); + m3 = _mm_slli_epi64(m0, 3); + m3 = _mm_xor_si128(m3, _mm_srli_epi64(m8, 4)); + m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 5)); + m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 6)); + LOOKUP(i, tab); + m4 = m0; + m0 = _mm_slli_epi64(m4, 4); + m1 = _mm_slli_epi64(m4, 5); + m2 = _mm_slli_epi64(m4, 6); + m3 = _mm_slli_epi64(m4, 7); + m3 = _mm_xor_si128(m3, m8); + m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 1)); + m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 2)); + m0 = _mm_xor_si128(m0, _mm_srli_epi64(m8, 3)); + LOOKUP(i, tab1); + } + m3 = s0; + m2 = _mm_srli_epi64(s0, 1); + m1 = _mm_setzero_si128(); + m0 = _mm_setzero_si128(); + tab1[0][i] = tab1[1][i] = tab1[2][i] = tab1[3][i] = m9 = _mm_setzero_si128(); + tab1[4][i] = 
tab1[5][i] = tab1[6][i] = tab1[7][i] = m2; + tab1[8][i] = tab1[9][i] = tab1[10][i] = tab1[11][i] = m3; + m2 =_mm_xor_si128(m3,m2); + tab1[12][i] = tab1[13][i] = tab1[14][i] = tab1[15][i] = m2; +#undef LOOKUP + +#define LSHIFT8(m3,m2,m1,m0)\ + m3=_mm_alignr_epi8(m3,m2,15);\ + m2=_mm_alignr_epi8(m2,m1,15);\ + m1=_mm_alignr_epi8(m1,m0,15);\ + m0=_mm_slli_si128(m0,1); + +#define M(m2,m1,m0,ta,tb)\ + ta &= 0x0f; tb &= 0x0f;\ + m0=_mm_xor_si128(m0, tab[ta][0]);\ + m1=_mm_xor_si128(m1, tab[ta][1]);\ + m0=_mm_xor_si128(m0,tab1[tb][0]);\ + m1=_mm_xor_si128(m1,tab1[tb][1]);\ + m2=_mm_xor_si128(m2,tab1[tb][2]);\ + + // Main computation + m0 = m1 = m2 = m3 = _mm_setzero_si128(); + + for (j = 56; j >= 0; j -= 8) { + k = j + 4; + ta = (a[1] >> j); + tb = (a[1] >> k); + M(m2, m1, m0, ta, tb); + ta = (a[3] >> j); + tb = (a[3] >> k); + M(m3, m2, m1, ta, tb); + LSHIFT8(m3, m2, m1, m0); + } + for (j = 56; j >= 8; j -= 8) { + k = j + 4; + ta = (a[0] >> j); + tb = (a[0] >> k); + M(m2, m1, m0, ta, tb); + ta = (a[2] >> j); + tb = (a[2] >> k); + M(m3, m2, m1, ta, tb); + LSHIFT8(m3, m2, m1, m0); + } + ta = a[0]; + tb = (a[0] >> 4); + M(m2, m1, m0, ta, tb); + ta = a[2]; + tb = (a[2] >> 4); + M(m3, m2, m1, ta, tb); + +#undef M + + REDUCE(); + _mm_store_si128((__m128i *) c + 0, m0); + _mm_store_si128((__m128i *) x, m1); + c[2] = x[0]; + c[3] = x[1] & 0x07FFFFFFFFFFFFFF; +#undef M +} + +#endif diff --git a/src/low/curve2251-sse/relic_fb_mul_low_sf.c b/src/low/curve2251-sse/relic_fb_mul_low_sf.c new file mode 100755 index 000000000..f157084bb --- /dev/null +++ b/src/low/curve2251-sse/relic_fb_mul_low_sf.c @@ -0,0 +1,364 @@ +/* + * Copyright 2007 Project RELIC + * + * This file is part of RELIC. RELIC is legal property of its developers, + * whose names are not listed here. Please refer to the COPYRIGHT file. 
+ * + * RELIC is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RELIC is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with RELIC. If not, see . + */ + +/** + * @file + * + * Implementation of the low-level binary field bit multiplication functions. + * + * @version $Id$ + * @ingroup fb + */ + +#include + +#include "relic_fb.h" +#include "relic_fb_low.h" +#include "relic_bn_low.h" +#include "relic_util.h" + +#include +#include +#include "macros.h" + +#define INV(A,B,C,D) D,C,B,A + +const align uint32_t tm[] = { + INV(0x00000000, 0x00000000, 0x00000000, 0x00000000), + INV(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100), + INV(0x1E1C1A18, 0x16141210, 0x0E0C0A08, 0x06040200), + INV(0x11121714, 0x1D1E1B18, 0x090A0F0C, 0x05060300), + INV(0x3C383430, 0x2C282420, 0x1C181410, 0x0C080400), + INV(0x3336393C, 0x27222D28, 0x1B1E1114, 0x0F0A0500), + INV(0x22242E28, 0x3A3C3630, 0x12141E18, 0x0A0C0600), + INV(0x2D2A2324, 0x31363F38, 0x15121B1C, 0x090E0700), + INV(0x78706860, 0x58504840, 0x38302820, 0x18100800), + INV(0x777E656C, 0x535A4148, 0x3F362D24, 0x1B120900), + INV(0x666C7278, 0x4E445A50, 0x363C2228, 0x1E140A00), + INV(0x69627F74, 0x454E5358, 0x313A272C, 0x1D160B00), + INV(0x44485C50, 0x74786C60, 0x24283C30, 0x14180C00), + INV(0x4B46515C, 0x7F726568, 0x232E3934, 0x171A0D00), + INV(0x5A544648, 0x626C7E70, 0x2A243638, 0x121C0E00), + INV(0x555A4B44, 0x69667778, 0x2D22333C, 0x111E0F00), +}; + +/*============================================================================*/ +/* Public definitions */ 
+/*============================================================================*/ + +void fb_mul1_low(dig_t *c, dig_t *a, dig_t digit) { /* c (FB_DIGS + 1 digits) = a times digit, accumulated by shift-and-XOR for each set bit of digit */ + int j, k; + dig_t b1, b2; + + if (digit == 0) { + dv_zero(c, FB_DIGS + 1); + return; + } + if (digit == 1) { + fb_copy(c, a); c[FB_DIGS] = 0; /* define the extra high digit too, as the digit == 0 and general paths do */ + return; + } + c[FB_DIGS] = fb_lshb_low(c, a, util_bits_dig(digit) - 1); /* contribution of the top set bit; the return value becomes the extra high digit */ + for (int i = util_bits_dig(digit) - 2; i > 0; i--) { + if (digit & ((dig_t)1 << i)) { + j = FB_DIGIT - i; + b1 = a[0]; + c[0] ^= (b1 << i); + for (k = 1; k < FB_DIGS; k++) { + b2 = a[k]; + c[k] ^= ((b2 << i) | (b1 >> j)); + b1 = b2; + } + c[FB_DIGS] ^= (b1 >> j); + } + } + if (digit & (dig_t)1) { /* bit 0 needs no shift */ + fb_add(c, c, a); + } +} + +void fb_mulm_low(dig_t * c, dig_t * a, dig_t * b) { + __m128i rl[FB_DIGS], rh[FB_DIGS], l0, l1, h0, h1; + __m128i t0, t1, mask, m[FB_DIGS], m0, m1, m2, m3, m8, m9; + dig_t r0, r1, r2, r3; + int i, j, k, ta; + align dig_t x[2]; /* aligned: x is the destination of the _mm_store_si128 at the end of this function */ + + mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); + + /* computes BL, BH and BM */ + h0 = _mm_load_si128((__m128i *) & b[0]); + h1 = _mm_load_si128((__m128i *) & b[2]); + +#define LSHIFT4(m)\ + m1=m[3];\ + t1=_mm_srli_epi64(m1,60);\ + for(j=3;j>0;j--){\ + m1=_mm_slli_epi64(m1,4);\ + t0=_mm_xor_si128(m1,_mm_slli_si128(t1,8));\ + m1=m[j-1];\ + t1=_mm_srli_epi64(m1,60);\ + m[j]=_mm_xor_si128(t0,_mm_srli_si128(t1,8));\ + }\ + m[0]=_mm_slli_epi64(m1,4);\ + m[0]=_mm_xor_si128(m[0],_mm_slli_si128(t1,8)); + +#define LSHIFT8(m4,m3,m2,m1,m0)\ + m3=_mm_alignr_epi8(m3,m2,15);\ + m2=_mm_alignr_epi8(m2,m1,15);\ + m1=_mm_alignr_epi8(m1,m0,15);\ + m0=_mm_slli_si128(m0,1); + +#define LSHIFT8V(m)\ + m[3]=_mm_alignr_epi8(m[3],m[2],15);\ + m[2]=_mm_alignr_epi8(m[2],m[1],15);\ + m[1]=_mm_alignr_epi8(m[1],m[0],15);\ + m[0]=_mm_slli_si128(m[0],1); + +#define M(m1,m0,ta,b0,b1)\ + t0 = _mm_load_si128((__m128i *)tm + ta);\ + m0 = _mm_xor_si128(m0, _mm_shuffle_epi8(t0, b0));\ + m1 = _mm_xor_si128(m1, _mm_shuffle_epi8(t0, b1));\ + +#define MultK(al,b0,b1)\ + m0=m1=m2=m3=_mm_setzero_si128();\ +
for(k=56;k>=0; k-=8){\ + ta=(r1>>k)&0x0f;\ + M(m1,m0,ta,b0,b1);\ + ta=(r3>>k)&0x0f;\ + M(m2,m1,ta,b0,b1);\ + LSHIFT8(m4,m3,m2,m1,m0);\ + }\ + for(k=56;k>=8; k-=8){\ + ta=(r0>>k)&0x0f;\ + M(m1,m0,ta,b0,b1);\ + ta=(r2>>k)&0x0f;\ + M(m2,m1,ta,b0,b1);\ + LSHIFT8(m4,m3,m2,m1,m0);\ + }\ + ta=(r0)&0xf; M(m1,m0,ta,b0,b1);\ + ta=(r2)&0xf; M(m2,m1,ta,b0,b1);\ + + l0 = _mm_and_si128(h0, mask); + l1 = _mm_and_si128(h1, mask); + + r0 = a[0]; + r1 = a[1]; + r2 = a[2]; + r3 = a[3]; + + /* AL * BL */ + MultK(a, l0, l1); + + rl[0] = m0; + rl[1] = m1; + rl[2] = m2; + rl[3] = m3; + + h0 = _mm_and_si128(_mm_srli_epi64(h0, 4), mask); + h1 = _mm_and_si128(_mm_srli_epi64(h1, 4), mask); + + r0 >>= 4; + r1 >>= 4; + r2 >>= 4; + r3 >>= 4; + + /* AH * BH */ + MultK(ah, h0, h1); + + rh[0] = m0; + rh[1] = m1; + rh[2] = m2; + rh[3] = m3; + + h0 = _mm_xor_si128(h0, l0); + h1 = _mm_xor_si128(h1, l1); + + r0 ^= a[0]; + r1 ^= a[1]; + r2 ^= a[2]; + r3 ^= a[3]; + + /* AM * BM */ + MultK(am, h0, h1); + + m[0] = m0; + m[1] = m1; + m[2] = m2; + m[3] = m3; + + /* m = m + rh + rl */ + for (i = 0; i < FB_DIGS; i++) { + m[i] = _mm_xor_si128(m[i], rh[i]); + m[i] = _mm_xor_si128(m[i], rl[i]); + } + + /* m= m + x^8 rh + x^4 m + rl */ + + LSHIFT4(m); + LSHIFT8V(rh); + + for (i = 0; i < FB_DIGS; i++) { + m[i] = _mm_xor_si128(m[i], rh[i]); + m[i] = _mm_xor_si128(m[i], rl[i]); + } + + m0 = m[0]; + m1 = m[1]; + m2 = m[2]; + m3 = m[3]; + + REDUCE(); + _mm_store_si128((__m128i *) c + 0, m0); + _mm_store_si128((__m128i *) x, m1); + c[2] = x[0]; + c[3] = x[1] & 0x07FFFFFFFFFFFFFF; +} + +void fb_muln_low(dig_t * c, dig_t * a, dig_t * b) { + __m128i rl[FB_DIGS], rh[FB_DIGS], l0, l1, h0, h1; + __m128i t0, t1, mask, m[FB_DIGS], m0, m1, m2, m3; + dig_t r0, r1, r2, r3; + int i, j, k, ta; + + mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); + + /* computes BL, BH and BM */ + h0 = _mm_load_si128((__m128i *) & b[0]); + h1 = _mm_load_si128((__m128i *) & b[2]); + +#define LSHIFT4(m)\ + m1=m[3];\ + 
t1=_mm_srli_epi64(m1,60);\ + for(j=3;j>0;j--){\ + m1=_mm_slli_epi64(m1,4);\ + t0=_mm_xor_si128(m1,_mm_slli_si128(t1,8));\ + m1=m[j-1];\ + t1=_mm_srli_epi64(m1,60);\ + m[j]=_mm_xor_si128(t0,_mm_srli_si128(t1,8));\ + }\ + m[0]=_mm_slli_epi64(m1,4);\ + m[0]=_mm_xor_si128(m[0],_mm_slli_si128(t1,8)); + +#define LSHIFT8(m4,m3,m2,m1,m0)\ + m3=_mm_alignr_epi8(m3,m2,15);\ + m2=_mm_alignr_epi8(m2,m1,15);\ + m1=_mm_alignr_epi8(m1,m0,15);\ + m0=_mm_slli_si128(m0,1); + +#define LSHIFT8V(m)\ + m[3]=_mm_alignr_epi8(m[3],m[2],15);\ + m[2]=_mm_alignr_epi8(m[2],m[1],15);\ + m[1]=_mm_alignr_epi8(m[1],m[0],15);\ + m[0]=_mm_slli_si128(m[0],1); + +#define M(m1,m0,ta,b0,b1)\ + t0 = _mm_load_si128((__m128i *)tm + ta);\ + m0 = _mm_xor_si128(m0, _mm_shuffle_epi8(t0, b0));\ + m1 = _mm_xor_si128(m1, _mm_shuffle_epi8(t0, b1));\ + +#define MultK(al,b0,b1)\ + m0=m1=m2=m3=_mm_setzero_si128();\ + for(k=56;k>=0; k-=8){\ + ta=(r1>>k)&0x0f;\ + M(m1,m0,ta,b0,b1);\ + ta=(r3>>k)&0x0f;\ + M(m2,m1,ta,b0,b1);\ + LSHIFT8(m4,m3,m2,m1,m0);\ + }\ + for(k=56;k>=8; k-=8){\ + ta=(r0>>k)&0x0f;\ + M(m1,m0,ta,b0,b1);\ + ta=(r2>>k)&0x0f;\ + M(m2,m1,ta,b0,b1);\ + LSHIFT8(m4,m3,m2,m1,m0);\ + }\ + ta=(r0)&0xf; M(m1,m0,ta,b0,b1);\ + ta=(r2)&0xf; M(m2,m1,ta,b0,b1);\ + + l0 = _mm_and_si128(h0, mask); + l1 = _mm_and_si128(h1, mask); + + r0 = a[0]; + r1 = a[1]; + r2 = a[2]; + r3 = a[3]; + + /* AL * BL */ + MultK(a, l0, l1); + + rl[0] = m0; + rl[1] = m1; + rl[2] = m2; + rl[3] = m3; + + h0 = _mm_and_si128(_mm_srli_epi64(h0, 4), mask); + h1 = _mm_and_si128(_mm_srli_epi64(h1, 4), mask); + + r0 >>= 4; + r1 >>= 4; + r2 >>= 4; + r3 >>= 4; + + /* AH * BH */ + MultK(ah, h0, h1); + + rh[0] = m0; + rh[1] = m1; + rh[2] = m2; + rh[3] = m3; + + h0 = _mm_xor_si128(h0, l0); + h1 = _mm_xor_si128(h1, l1); + + r0 ^= a[0]; + r1 ^= a[1]; + r2 ^= a[2]; + r3 ^= a[3]; + + /* AM * BM */ + MultK(am, h0, h1); + + m[0] = m0; + m[1] = m1; + m[2] = m2; + m[3] = m3; + + /* m = m + rh + rl */ + for (i = 0; i < FB_DIGS; i++) { + m[i] = _mm_xor_si128(m[i], 
rh[i]); + m[i] = _mm_xor_si128(m[i], rl[i]); + } + + /* m= m + x^8 rh + x^4 m + rl */ + + LSHIFT4(m); + LSHIFT8V(rh); + + for (i = 0; i < FB_DIGS; i++) { + m[i] = _mm_xor_si128(m[i], rh[i]); + m[i] = _mm_xor_si128(m[i], rl[i]); + } + + _mm_store_si128((__m128i *) c + 0, m[0]); + _mm_store_si128((__m128i *) c + 1, m[1]); + _mm_store_si128((__m128i *) c + 2, m[2]); + _mm_store_si128((__m128i *) c + 3, m[3]); +} diff --git a/src/low/curve2251-sse/relic_fb_rdc_low.c b/src/low/curve2251-sse/relic_fb_rdc_low.c new file mode 100755 index 000000000..48879d53b --- /dev/null +++ b/src/low/curve2251-sse/relic_fb_rdc_low.c @@ -0,0 +1,158 @@ +/* + * RELIC is an Efficient LIbrary for Cryptography + * Copyright (C) 2007-2011 RELIC Authors + * + * This file is part of RELIC. RELIC is legal property of its developers, + * whose names are not listed here. Please refer to the COPYRIGHT file + * for contact information. + * + * RELIC is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * RELIC is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with RELIC. If not, see . + */ + +/** + * @file + * + * Implementation of the low-level modular reduction functions. 
+ * + * @version $Id: relic_fb_rdc_low.c 194 2009-11-28 01:54:32Z dfaranha $ + * @ingroup fb + */ + +#include + +#include "relic_fb.h" +#include "relic_fb_low.h" +#include "relic_util.h" +#include "macros.h" + +/*============================================================================*/ +/* Public definitions */ +/*============================================================================*/ + +void fb_rdcn_low(dig_t *c, dig_t *a) { + dig_t d, a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; + const int ra = 52; + const int rb = 55; + const int rc = 57; + const int rh = 59; + const int lh = 5; + const int la = 12; + const int lb = 9; + const int lc = 7; + + d = a[7]; + a4 ^= (d >> rh); + a4 ^= (d >> ra); + a4 ^= (d >> rb); + a4 ^= (d >> rc); + + a3 ^= (d << lh); + a3 ^= (d << la); + a3 ^= (d << lb); + a3 ^= (d << lc); + + d = a[6]; + a3 ^= (d >> rh); + a3 ^= (d >> ra); + a3 ^= (d >> rb); + a3 ^= (d >> rc); + + a2 ^= (d << lh); + a2 ^= (d << la); + a2 ^= (d << lb); + a2 ^= (d << lc); + + d = a[5]; + a2 ^= (d >> rh); + a2 ^= (d >> ra); + a2 ^= (d >> rb); + a2 ^= (d >> rc); + + a1 ^= (d << lh); + a1 ^= (d << la); + a1 ^= (d << lb); + a1 ^= (d << lc); + + d = a4; + a1 ^= (d >> rh); + a1 ^= (d >> ra); + a1 ^= (d >> rb); + a1 ^= (d >> rc); + + a0 ^= (d << lh); + a0 ^= (d << la); + a0 ^= (d << lb); + a0 ^= (d << lc); + + d = a3 >> rh; + a0 ^= d; + d <<= rh; + + a0 ^= (d >> ra); + a0 ^= (d >> rb); + a0 ^= (d >> rc); + a3 ^= d; + + c[3] = a3; + c[2] = a2; + c[1] = a1; + c[0] = a0; +} + +void fb_rdc1_low(dig_t *c, dig_t *a) { + dig_t d; + const int fa = 7; + const int fb = 4; + const int fc = 2; + + const int rh = FB_BITS % FB_DIGIT; + const int sh = FB_BITS / FB_DIGIT + 1; + const int lh = FB_DIGIT - rh;; + const int ra = (FB_BITS - fa) % FB_DIGIT; + const int sa = (FB_BITS - fa) / FB_DIGIT + 1; + const int la = FB_DIGIT - ra; + const int rb = (FB_BITS - fb) % FB_DIGIT; + const int sb = (FB_BITS - fb) / FB_DIGIT + 1; + const int lb = FB_DIGIT - rb; + const int rc 
= (FB_BITS - fc) % FB_DIGIT; + const int sc = (FB_BITS - fc) / FB_DIGIT + 1; + const int lc = FB_DIGIT - rc; + + d = a[FB_DIGS]; + + a[FB_DIGS - sh] ^= (d << lh); + a[FB_DIGS - sa + 1] ^= (d >> ra); + a[FB_DIGS - sa] ^= (d << la); + + a[FB_DIGS - sb + 1] ^= (d >> rb); + a[FB_DIGS - sb] ^= (d << lb); + a[FB_DIGS - sc + 1] ^= (d >> rc); + a[FB_DIGS - sc] ^= (d << lc); + + d = a[sh - 1] >> rh; + + dig_t a0 = a[0]; + a0 ^= d; + d <<= rh; + + a0 ^= (d >> ra); + a0 ^= (d >> rb); + a0 ^= (d >> rc); + a[3] ^= d; + + c[0] = a0; + c[1] = a[1]; + c[2] = a[2]; + c[3] = a[3]; +} diff --git a/src/low/curve2251-sse/relic_fb_slv_low.c b/src/low/curve2251-sse/relic_fb_slv_low.c new file mode 100755 index 000000000..4825f527f --- /dev/null +++ b/src/low/curve2251-sse/relic_fb_slv_low.c @@ -0,0 +1,193 @@ +/* + * RELIC is an Efficient LIbrary for Cryptography + * Copyright (C) 2007-2011 RELIC Authors + * + * This file is part of RELIC. RELIC is legal property of its developers, + * whose names are not listed here. Please refer to the COPYRIGHT file + * for contact information. + * + * RELIC is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * RELIC is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with RELIC. If not, see . + */ + +/** + * @file + * + * Implementation of the low-level binary field half-trace. 
+ * + * @version $Id: relic_fb_slv_low.c 652 2011-02-20 23:50:00Z dfaranha $ + * @ingroup fb + */ + +#include +#include +#ifdef __PCLMUL__ +#include +#endif + +#include + +#include "relic_fb.h" +#include "relic_fb_low.h" +#include "relic_util.h" + +/*============================================================================*/ +/* Private definitions */ +/*============================================================================*/ + +void fb_slvn_low(dig_t *c, dig_t *a) { + int i; + dig_t *p, u0, u1, u2, u3; + void *tab = fb_poly_get_slv(); + __m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm; + + perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200); + mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000); + mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0); + mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); + sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100); + sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400); + + t0 = _mm_load_si128((__m128i *)a); + t1 = _mm_load_si128((__m128i *)(a + 2)); + r0 = r1 = _mm_setzero_si128(); + + m0 = _mm_shuffle_epi8(t1, perm); + m1 = _mm_and_si128(m0, mask0); + m2 = _mm_and_si128(m0, mask1); + m2 = _mm_srli_epi64(m2, 4); + m2 = _mm_shuffle_epi8(sqrt1, m2); + m1 = _mm_shuffle_epi8(sqrt0, m1); + m1 = _mm_xor_si128(m1, m2); + + m2 = _mm_slli_si128(m1, 8); + m1 = _mm_and_si128(m1, mask2); + m1 = _mm_slli_epi64(m1, 4); + m1 = _mm_xor_si128(m1, m2); + t0 = _mm_xor_si128(t0, m1); + r0 = _mm_xor_si128(r0, m1); + + m0 = _mm_and_si128(t0, mask2); + m0 = _mm_shuffle_epi8(m0, perm); + m1 = _mm_and_si128(m0, mask0); + m2 = _mm_and_si128(m0, mask1); + m2 = _mm_srli_epi64(m2, 4); + m2 = _mm_shuffle_epi8(sqrt1, m2); + m1 = _mm_shuffle_epi8(sqrt0, m1); + m1 = _mm_xor_si128(m1, m2); + + m2 = _mm_srli_si128(m1, 8); + m1 = _mm_andnot_si128(mask2, m1); + m2 = _mm_slli_epi64(m2, 4); + m1 = 
_mm_xor_si128(m1, m2); + t0 = _mm_xor_si128(t0, m1); + r0 = _mm_xor_si128(r0, m1); + + m1 = _mm_srli_si128(t0, 4); + m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF)); + m0 = _mm_shuffle_epi8(m1, perm); + m1 = _mm_and_si128(m0, mask0); + m2 = _mm_and_si128(m0, mask1); + m2 = _mm_srli_epi64(m2, 4); + m2 = _mm_shuffle_epi8(sqrt1, m2); + m1 = _mm_shuffle_epi8(sqrt0, m1); + m1 = _mm_xor_si128(m1, m2); + m2 = _mm_slli_si128(m1, 8); + m1 = _mm_slli_epi64(m1, 4); + m1 = _mm_xor_si128(m1, m2); + m1 = _mm_srli_si128(m1, 6); + t0 = _mm_xor_si128(t0, m1); + r0 = _mm_xor_si128(r0, m1); + + m1 = _mm_srli_si128(t0, 2); + m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF)); + m0 = _mm_shuffle_epi8(m1, perm); + m1 = _mm_and_si128(m0, mask0); + m2 = _mm_and_si128(m0, mask1); + m2 = _mm_srli_epi64(m2, 4); + m2 = _mm_shuffle_epi8(sqrt1, m2); + m1 = _mm_shuffle_epi8(sqrt0, m1); + m1 = _mm_xor_si128(m1, m2); + m2 = _mm_slli_si128(m1, 8); + m1 = _mm_slli_epi64(m1, 4); + m1 = _mm_xor_si128(m1, m2); + m1 = _mm_srli_si128(m1, 7); + t0 = _mm_xor_si128(t0, m1); + r0 = _mm_xor_si128(r0, m1); + + m1 = _mm_srli_si128(t0, 1); + m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55)); + m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); + m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33)); + m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2)); + m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F)); + m1 = _mm_slli_epi64(m1, 4); + t0 = _mm_xor_si128(t0, m1); + r0 = _mm_xor_si128(r0, m1); + + m1 = _mm_srli_epi64(t0, 4); + m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5)); + m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); + m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3)); + m1 = _mm_slli_epi64(m1, 2); + t0 = _mm_xor_si128(t0, m1); + r0 = _mm_xor_si128(r0, m1); + + m1 = _mm_srli_epi64(t0, 2); + m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1)); + m1 = _mm_slli_epi64(m1, 1); + t0 = _mm_xor_si128(t0, m1); + r0 = _mm_xor_si128(r0, m1); + + sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 
0x01010000); + sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000); + + m1 = _mm_and_si128(t0, mask0); + m2 = _mm_and_si128(t0, mask1); + m3 = _mm_and_si128(t1, mask0); + m4 = _mm_and_si128(t1, mask1); + m2 = _mm_srli_epi64(m2, 4); + m4 = _mm_srli_epi64(m4, 4); + m2 = _mm_shuffle_epi8(sqrt1, m2); + m1 = _mm_shuffle_epi8(sqrt0, m1); + m4 = _mm_shuffle_epi8(sqrt1, m4); + m3 = _mm_shuffle_epi8(sqrt0, m3); + m1 = _mm_or_si128(m1, m2); + u0 = _mm_extract_epi64(m1, 0); + u1 = _mm_extract_epi64(m1, 1); + m3 = _mm_or_si128(m3, m4); + u2 = _mm_extract_epi64(m3, 0); + u3 = _mm_extract_epi64(m3, 1); + + for (i = 0; i < 8; i++) { + p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st)); + r0 = _mm_xor_si128(r0, *(__m128i *)(p)); + r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); + u0 >>= 8; + p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st)); + r0 = _mm_xor_si128(r0, *(__m128i *)(p)); + r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); + u1 >>= 8; + p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st)); + r0 = _mm_xor_si128(r0, *(__m128i *)(p)); + r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); + u2 >>= 8; + p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st)); + r0 = _mm_xor_si128(r0, *(__m128i *)(p)); + r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); + u3 >>= 8; + } + + _mm_store_si128((__m128i *)c, r0); + _mm_store_si128((__m128i *)(c + 2), r1); +} diff --git a/src/low/curve2251-sse/relic_fb_sqr_low.c b/src/low/curve2251-sse/relic_fb_sqr_low.c new file mode 100755 index 000000000..692793da3 --- /dev/null +++ b/src/low/curve2251-sse/relic_fb_sqr_low.c @@ -0,0 +1,251 @@ +/* + * Copyright 2007 Project RELIC + * + * This file is part of RELIC. RELIC is legal property of its developers, + * whose names are not listed here. Please refer to the COPYRIGHT file. 
+ * + * RELIC is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RELIC is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with RELIC. If not, see . + */ + +/** + * @file + * + * Implementation of the low-level binary field squaring. + * + * @version $Id$ + * @ingroup fb + */ + +#include + +#include "relic_fb.h" +#include "relic_fb_low.h" +#include "relic_util.h" +#include "macros.h" + +/*============================================================================*/ +/* Public definitions */ +/*============================================================================*/ + +#include + +void fb_sqrn_low(dig_t *c, dig_t *a) { /* writes two output digits per input digit; the shift/OR/mask cascade (16, 8, 4, 2, 1) leaves a zero bit after every input bit */ + __m128i m0, t; + int j = 0; + + for (int i = 0; i < FB_DIGS; i++) { + m0 = _mm_set_epi32(0x00000000, (int)(a[i] >> 32), 0x00000000, (int)(a[i] & 0xFFFFFFFF)); /* low half in the low 64-bit lane, high half in the high lane */ + t = _mm_slli_epi64(m0, 16); + t = _mm_or_si128(t, m0); + t = _mm_and_si128(t, _mm_set_epi32(0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF)); + m0 = t; + t = _mm_slli_epi64(m0, 8); + t = _mm_or_si128(t, m0); + t = _mm_and_si128(t, _mm_set_epi32(0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF)); + m0 = t; + t = _mm_slli_epi64(m0, 4); + t = _mm_or_si128(t, m0); + t = _mm_and_si128(t, _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F)); + m0 = t; + t = _mm_slli_epi64(m0, 2); + t = _mm_or_si128(t, m0); + t = _mm_and_si128(t, _mm_set_epi32(0x33333333, 0x33333333, 0x33333333, 0x33333333)); + m0 = t; + t = _mm_slli_epi64(m0, 1); + t = _mm_or_si128(t, m0); + t = _mm_and_si128(t, _mm_set_epi32(0x55555555, 0x55555555, 
0x55555555, 0x55555555)); + _mm_store_si128((__m128i *)(c + j), t); + j += 2; + } +} + +void fb_sqrl_low(dig_t *c, dig_t *a) { + __m128i m0, m1, m2, m3, m4, m5, m6, mask; + __m128i t0; + + t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100); + mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); + + m0 = _mm_load_si128((__m128i *)(a)); + m1 = _mm_and_si128(m0, mask); + m1 = _mm_shuffle_epi8(t0, m1); + m2 = _mm_srli_epi64(m0, 4); + m2 = _mm_and_si128(m2, mask); + m2 = _mm_shuffle_epi8(t0, m2); + m3 = _mm_unpacklo_epi8(m1, m2); + m4 = _mm_unpackhi_epi8(m1, m2); + + m0 = _mm_load_si128((__m128i *)(a+2)); + m1 = _mm_and_si128(m0, mask); + m1 = _mm_shuffle_epi8(t0, m1); + m2 = _mm_srli_epi64(m0, 4); + m2 = _mm_and_si128(m2, mask); + m2 = _mm_shuffle_epi8(t0, m2); + m5 = _mm_unpacklo_epi8(m1, m2); + m6 = _mm_unpackhi_epi8(m1, m2); + + _mm_store_si128((__m128i *)(c + 0), m3); + _mm_store_si128((__m128i *)(c + 2), m4); + _mm_store_si128((__m128i *)(c + 4), m5); + _mm_store_si128((__m128i *)(c + 6), m6); +} + +#if defined(__INTEL_COMPILER) + +void fb_sqrm_low(dig_t *c, dig_t *a) { + __m128i t0, t1, m0, m1, m2, m3, m4, m5, m6, m8, m9, mask; + align dig_t x[2]; + + t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100); + mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); + + m0 = _mm_load_si128((__m128i *)(a)); + m1 = _mm_and_si128(m0, mask); + m1 = _mm_shuffle_epi8(t0, m1); + m2 = _mm_srli_epi64(m0, 4); + m2 = _mm_and_si128(m2, mask); + m2 = _mm_shuffle_epi8(t0, m2); + m3 = _mm_unpacklo_epi8(m1, m2); + m4 = _mm_unpackhi_epi8(m1, m2); + + m0 = _mm_load_si128((__m128i *)(a+2)); + m1 = _mm_and_si128(m0, mask); + m1 = _mm_shuffle_epi8(t0, m1); + m2 = _mm_srli_epi64(m0, 4); + m2 = _mm_and_si128(m2, mask); + m2 = _mm_shuffle_epi8(t0, m2); + m5 = _mm_unpacklo_epi8(m1, m2); + m6 = _mm_unpackhi_epi8(m1, m2); + + m0 = m3; + m1 = m4; + m2 = m5; + m3 = m6; + + REDUCE(); + _mm_store_si128((__m128i *) c + 0, m0); + 
_mm_store_si128((__m128i *) x, m1); + c[2] = x[0]; + c[3] = x[1] & 0x07FFFFFFFFFFFFFF; +} + +#else + /** Squares the four-digit element at a[0..3] and writes the reduced four-digit result to c[0..3]. Every nibble of the input is expanded to a byte through the PSHUFB table t0, the eight expanded digits are spilled to t[], and t[7], t[6], t[5] and the updated t[4] are then folded downwards with fixed shifts. a is read with _mm_load_si128, so it must be 16-byte aligned. */ +void fb_sqrm_low(dig_t *c, dig_t *a) { + __m128i t0, m0, m1, m2, m3, m4, m5, m6, mask; + align dig_t t[2*FB_DIGS]; + /* t0[i] is nibble i with a zero bit inserted after every bit (0x0 -> 0x00, 0x3 -> 0x05, 0xF -> 0x55); mask keeps the low nibble of each byte. */ + t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100); + mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); + /* Expand a[0..1]: look up low and high nibbles separately, then interleave the result bytes into m3 (from the low eight bytes) and m4 (from the high eight bytes). */ + m0 = _mm_load_si128((__m128i *)(a)); + m1 = _mm_and_si128(m0, mask); + m1 = _mm_shuffle_epi8(t0, m1); + m2 = _mm_srli_epi64(m0, 4); + m2 = _mm_and_si128(m2, mask); + m2 = _mm_shuffle_epi8(t0, m2); + m3 = _mm_unpacklo_epi8(m1, m2); + m4 = _mm_unpackhi_epi8(m1, m2); + /* Same expansion for a[2..3], giving m5 and m6. */ + m0 = _mm_load_si128((__m128i *)(a+2)); + m1 = _mm_and_si128(m0, mask); + m1 = _mm_shuffle_epi8(t0, m1); + m2 = _mm_srli_epi64(m0, 4); + m2 = _mm_and_si128(m2, mask); + m2 = _mm_shuffle_epi8(t0, m2); + m5 = _mm_unpacklo_epi8(m1, m2); + m6 = _mm_unpackhi_epi8(m1, m2); + + m0 = m3; + m1 = m4; + m2 = m5; + m3 = m6; + /* Spill the eight expanded digits to t[0..7] so the folds below can work on scalar digits. */ + _mm_store_si128((__m128i *) t + 0, m0); + _mm_store_si128((__m128i *) t + 1, m1); + _mm_store_si128((__m128i *) t + 2, m2); + _mm_store_si128((__m128i *) t + 3, m3); + /* Shift amounts for the folds: each right shift r is paired with the left shift 64 - r (59/5, 52/12, 55/9, 57/7). Assuming 64-bit digits these match z^251 = z^7 + z^4 + z^2 + 1. */ + const int ra = 52; + const int rb = 55; + const int rc = 57; + const int rh = 59; + const int lh = 5; + const int la = 12; + const int lb = 9; + const int lc = 7; + /* d is the digit being folded; it contributes to the digits three and four positions below it. */ + dig_t d = t[7], a0 = t[0], a1 = t[1], a2 = t[2], a3 = t[3], a4 = t[4]; + + a4 ^= (d >> rh); + a4 ^= (d >> ra); + a4 ^= (d >> rb); + a4 ^= (d >> rc); + + a3 ^= (d << lh); + a3 ^= (d << la); + a3 ^= (d << lb); + a3 ^= (d << lc); + + d = t[6]; + a3 ^= (d >> rh); + a3 ^= (d >> ra); + a3 ^= (d >> rb); + a3 ^= (d >> rc); + + a2 ^= (d << lh); + a2 ^= (d << la); + a2 ^= (d << lb); + a2 ^= (d << lc); + + d = t[5]; + a2 ^= (d >> rh); + a2 ^= (d >> ra); + a2 ^= (d >> rb); + a2 ^= (d >> rc); + + a1 ^= (d << lh); + a1 ^= (d << la); + a1 ^= (d << lb); + a1 ^= (d << lc); + /* a4 already holds the contribution of t[7], so it is folded last among the high digits. */ + d = a4; + a1 ^= (d >> rh); + a1 ^= (d >> ra); + a1 ^= (d >> rb); + a1 ^= (d >> rc); + + a0 ^= (d << lh); + a0 ^= (d << la); + a0 ^= (d << lb); + 
a0 ^= (d << lc); + /* Bits 59 and above of a3 are folded into a0 with the same shift set and then cleared from a3. */ + d = a3 >> rh; + a0 ^= d; + d <<= rh; + + a0 ^= (d >> ra); + a0 ^= (d >> rb); + a0 ^= (d >> rc); + a3 ^= d; + + c[3] = a3; + c[2] = a2; + c[1] = a1; + c[0] = a0; + + return; +} + +#endif diff --git a/src/low/curve2251-sse/relic_fb_srt_low.c b/src/low/curve2251-sse/relic_fb_srt_low.c new file mode 100755 index 000000000..7bc0d8f9f --- /dev/null +++ b/src/low/curve2251-sse/relic_fb_srt_low.c @@ -0,0 +1,184 @@ +/* + * RELIC is an Efficient LIbrary for Cryptography + * Copyright (C) 2007-2011 RELIC Authors + * + * This file is part of RELIC. RELIC is legal property of its developers, + * whose names are not listed here. Please refer to the COPYRIGHT file + * for contact information. + * + * RELIC is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * RELIC is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with RELIC. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * @file + * + * Implementation of the low-level binary field square root. 
+ * + * @version $Id: relic_fb_srt_low.c 207 2009-12-25 20:15:28Z dfaranha $ + * @ingroup fb + */ + /* NOTE(review): the three #include directives below show no header name here; presumably the angle-bracket names were lost in extraction. Confirm against the original file. */ +#include +#include +#ifdef __PCLMUL__ +#include +#endif + +#include "relic_fb.h" +#include "relic_fb_low.h" +#include "relic_util.h" +#include "macros.h" + +/*============================================================================*/ +/* Private definitions */ +/*============================================================================*/ + /* Number of FB_DIGIT-sized units needed to cover FB_BITS / 2, rounded up. */ +#define HALF ((int)((FB_BITS / 2)/(FB_DIGIT) + ((FB_BITS / 2) % FB_DIGIT > 0))) + +#ifndef __PCLMUL__ + /** Table-driven variant, built when __PCLMUL__ is not defined. Walks the 16 bytes of a[1] and then a[0] from most to least significant; for each byte it XORs the two 128-bit words at fb_poly_tab_srz(byte) into m1:m0 and shifts m2:m1:m0 left by one byte between steps. The accumulator is then reduced and stored to c[0..3]. Only a[0] and a[1] are read. NOTE(review): presumably this multiplies the input by the constant behind fb_poly_tab_srz; confirm against its definition. */ +void fb_mulh_low(dig_t *c, dig_t *a) { + __m128i m0, m1, m2, m3, m8, m9, t0, t1; + unsigned char ta; + int j; + align dig_t x[2]; + dig_t *tab; + +#define LSHIFT8(m2,m1,m0)\ + m2=_mm_alignr_epi8(m2,m1,15);\ + m1=_mm_alignr_epi8(m1,m0,15);\ + m0=_mm_slli_si128(m0,1); + +#define M(m1,m0,ta)\ + tab = fb_poly_tab_srz(ta);\ + m0=_mm_xor_si128(m0, ((__m128i *)tab)[0]);\ + m1=_mm_xor_si128(m1, ((__m128i *)tab)[1]);\ + + // Main computation + m0 = m1 = m2 = m3 = _mm_setzero_si128(); + + for (j = 56; j >= 0; j -= 8) { + ta = (a[1] >> j) & 0xFF; + M(m1, m0, ta); + LSHIFT8(m2, m1, m0); + } + for (j = 56; j >= 8; j -= 8) { + ta = (a[0] >> j) & 0xFF; + M(m1, m0, ta); + LSHIFT8(m2, m1, m0); + } + ta = a[0] & 0xFF; + M(m1, m0, ta); + /* RED251 folds m2 into m1:m0; the statements after it fold bits 59 and above of the top digit of m1 into m0. The trailing backslashes sit outside any macro and only splice source lines. */ + RED251(m2,m1,m0); \ + m8 = _mm_srli_si128(m1,8); \ + m9 = _mm_srli_epi64(m8,59); \ + m9 = _mm_slli_epi64(m9,59); \ + m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,59)); \ + m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,57)); \ + m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,55)); \ + m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,52)); \ + + _mm_store_si128((__m128i *) c + 0, m0); + _mm_store_si128((__m128i *) x, m1); + c[2] = x[0]; + c[3] = x[1] & 0x07FFFFFFFFFFFFFF; +#undef M +} + +#else + /** Carry-less multiplication variant, built when __PCLMUL__ is defined. Multiplies the two digits a[0..1] by the four digits returned by fb_poly_get_srz() using two MUL expansions, reduces the result held in m2:m1:m0 and stores it to c[0..3]. a and the returned pointer are read with _mm_load_si128, so both must be 16-byte aligned. */ +void fb_mulh_low(dig_t *c, dig_t *a) { + __m128i ma0, ma1, mb0, mb1, m0, m1, m2, m3, m4, m5, m8, m9, t0, t1, t2, t3; + dig_t *b = fb_poly_get_srz(); + + ma0 = _mm_load_si128((__m128i *)a); + mb0 = _mm_load_si128((__m128i *)b); + mb1 = 
_mm_load_si128((__m128i *)b + 1); + /* m1:m0 = a * b[0..1]; MUL leaves its product in t1:t0. */ + MUL(ma0, mb0); + m0 = t0; + m1 = t1; + /* a * (b[0..1] ^ b[2..3]) ^ a * b[0..1] = a * b[2..3], collected in m2:m4. */ + mb0 = XOR(mb0, mb1); + + MUL(ma0, mb0); + m4 = _mm_xor_si128(t0, m0); + m2 = _mm_xor_si128(t1, m1); + /* Add the low half of a * b[2..3] at digit offset 2. */ + m1 = XOR(m1, m4); + + align dig_t _x[2]; + /* RED251 folds m2 into m1:m0, then bits 59 and above of the top digit of m1 are folded into m0; the final mask on c[3] clears those bits. */ + RED251(m2,m1,m0); \ + m8 = _mm_srli_si128(m1,8); \ + m9 = _mm_srli_epi64(m8,59); \ + m9 = _mm_slli_epi64(m9,59); \ + m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,59)); \ + m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,57)); \ + m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,55)); \ + m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,52)); \ + _mm_store_si128((__m128i *) c + 0, m0); + _mm_store_si128((__m128i *) _x, m1); + c[2] = _x[0]; + c[3] = _x[1] & 0x07FFFFFFFFFFFFFF; + return; +#undef M /* NOTE(review): M is not defined in this branch as shown; this looks like a leftover. */ +} + +#endif + +/*============================================================================*/ +/* Public definitions */ +/*============================================================================*/ + /** Splits each pair of input digits into its even-indexed bits (collected in t_e) and its odd-indexed bits (collected in t_o), then calls fb_mulh_low(c, t_o) and XORs t_e into the first HALF digits of c. Per the file header this is the low-level binary field square root. a is read with _mm_load_si128 two digits at a time, FB_DIGS digits in total. */ +void fb_srtn_low(dig_t *c, dig_t *a) { + __m128i m0, m1, m2, perm, mask0, mask1, sqrt0, sqrt1; + align dig_t x[2], d0, d1; + align dig_t t_e[FB_DIGS] = {0}, t_o[FB_DIGS] = {0}; + int i, n; + + //sqrt1 = sqrt0<<2 + sqrt0 = _mm_set_epi32(0x33322322, 0x31302120, 0x13120302, 0x11100100); + sqrt1 = _mm_set_epi32(0xccc88c88, 0xc4c08480, 0x4c480c08, 0x44400400); + perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200); + mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0); + mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); + /* perm gathers even-indexed bytes into the low half and odd-indexed bytes into the high half. sqrt0 and sqrt1 map a low and a high nibble to a byte whose low nibble holds the even-indexed bits and whose high nibble holds the odd-indexed bits. */ + n = 0; + for (i = 0; i < FB_DIGS; i += 2) { + m1 = _mm_load_si128((__m128i *) & a[i]); + m0 = _mm_shuffle_epi8(m1, perm); + m1 = _mm_and_si128(m0, mask0); + m2 = _mm_and_si128(m0, mask1); + m2 = _mm_srli_epi64(m2, 4); + m2 = _mm_shuffle_epi8(sqrt1, m2); + m1 = _mm_shuffle_epi8(sqrt0, m1); + m2 = _mm_or_si128(m1, m2); + m0 = _mm_and_si128(m2, mask0); + m1 = _mm_and_si128(m2, mask1); + _mm_store_si128((__m128i *) x, m0); + d0 = x[0] | (x[1] << 4); + _mm_store_si128((__m128i *) x, m1); + d1 = x[1] | (x[0] >> 4); 
+ t_e[n] = d0; + t_o[n] = d1; + n++; + } + /* Only the first n entries of t_e and t_o were written; the rest keep their zero initializer. */ + fb_mulh_low(c, t_o); + for (i = 0; i < HALF; i++) { + c[i] ^= t_e[i]; + } +} diff --git a/src/low/curve2251-sse/relic_fb_trc_low.c b/src/low/curve2251-sse/relic_fb_trc_low.c new file mode 100644 index 000000000..6b251747e --- /dev/null +++ b/src/low/curve2251-sse/relic_fb_trc_low.c @@ -0,0 +1,41 @@ +/* + * RELIC is an Efficient LIbrary for Cryptography + * Copyright (C) 2007-2011 RELIC Authors + * + * This file is part of RELIC. RELIC is legal property of its developers, + * whose names are not listed here. Please refer to the COPYRIGHT file + * for contact information. + * + * RELIC is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * RELIC is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with RELIC. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * @file + * + * Implementation of the low-level trace function. 
+ * + * @version $Id: relic_fb_slv_low.c 652 2011-02-20 23:50:00Z dfaranha $ + * @ingroup fb + */ + +#include "relic_fb.h" +#include "relic_fb_low.h" + +/*============================================================================*/ +/* Public definitions */ +/*============================================================================*/ + /** Returns 0 or 1: the XOR of bit 0 of a[0], bit 55 of a[3] and bit 57 of a[3]. Per the file header this is the low-level trace function; only a[0] and a[3] are read. NOTE(review): the two bit positions presumably come from the field polynomial used by this back end; confirm before reusing with another field. */ +dig_t fb_trcn_low(dig_t *a) { + return (a[0] ^(a[3] >> 55) ^(a[3] >> 57)) & 0x01; +} diff --git a/test/test_fb.c b/test/test_fb.c index 4560bc7d8..07587ea98 100644 --- a/test/test_fb.c +++ b/test/test_fb.c @@ -690,15 +690,21 @@ static int reduction(void) { dv_new(t1); TEST_BEGIN("modular reduction is correct") { - fb_rand(a); - /* Test if a * f(z) mod f(z) == 0. */ - fb_mul(b, a, fb_poly_get()); if (FB_POLYN % FB_DIGIT == 0) { + /* Test if a * f(z) mod f(z) == 0. */ + fb_rand(a); + fb_mul(b, a, fb_poly_get()); fb_copy(t0, b); fb_copy(t0 + FB_DIGS, a); - fb_rdc(b, t0); + fb_rdc(a, t0); + } else { + /* Test if f(z) * z^(m-1) mod f(z) == 0. */ + dv_zero(t0, FB_DIGS); + t0[FB_DIGS - 1] = (dig_t)1 << (FB_DIGIT - 1); + fb_rsh(t0 + FB_DIGS, fb_poly_get(), 1); + fb_rdc(a, t0); } - TEST_ASSERT(fb_is_zero(b) == 1, end); + TEST_ASSERT(fb_is_zero(a) == 1, end); } TEST_END;