diff --git a/src/low/curve2251-sse/CMakeLists.txt b/src/low/curve2251-sse/CMakeLists.txt
new file mode 100644
index 000000000..0558cc2e4
--- /dev/null
+++ b/src/low/curve2251-sse/CMakeLists.txt
@@ -0,0 +1 @@
+set(ARITH_LIBS "gmp") # GMP: the bn_*_low sources of this backend call mpn_* routines (presumably ARITH_LIBS is what the parent build links against -- confirm).
\ No newline at end of file
diff --git a/src/low/curve2251-sse/macros.h b/src/low/curve2251-sse/macros.h
new file mode 100755
index 000000000..d6a163919
--- /dev/null
+++ b/src/low/curve2251-sse/macros.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Useful macros for binary field arithmetic.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#define PSHUFB(A, B) _mm_shuffle_epi8(A, B) /* byte shuffle of A by the indices in B; no trailing ';' so it nests in expressions like the wrappers below */
+#define SHL64(A, B) _mm_slli_epi64(A, B) /* shift each 64-bit lane left by B bits */
+#define SHR64(A, B) _mm_srli_epi64(A, B) /* shift each 64-bit lane right by B bits */
+#define XOR(A, B) _mm_xor_si128(A, B) /* 128-bit bitwise XOR */
+#define SHL8(A, B) _mm_slli_si128(A, B) /* shift the whole register left by B bytes */
+#define SHR8(A, B) _mm_srli_si128(A, B) /* shift the whole register right by B bytes */
+#define AND(A, B) _mm_and_si128(A, B) /* 128-bit bitwise AND */
+
+#define MUL(ma, mb) /* 128x128-bit carry-less product by Karatsuba: result in t1:t0, clobbers the caller's t0..t3 */ \
+ t0 = _mm_clmulepi64_si128(ma, mb, 0x00); /* low(ma) x low(mb) */\
+ t1 = _mm_clmulepi64_si128(ma, mb, 0x11); /* high(ma) x high(mb) */\
+ t2 = XOR(SHR8(ma, 8), ma); /* lower lane = high(ma) ^ low(ma) */\
+ t3 = XOR(SHR8(mb, 8), mb); /* lower lane = high(mb) ^ low(mb) */\
+ t2 = _mm_clmulepi64_si128(t2, t3, 0x00); /* product of the two half sums */\
+ t2 = XOR(t2, t0);\
+ t2 = XOR(t2, t1); /* t2 is now the middle term */\
+ t3 = SHR8(t2, 8); /* upper 64 bits of the middle term */\
+ t2 = SHL8(t2, 8); /* lower 64 bits of the middle term, moved to the upper lane */\
+ t0 = XOR(t0, t2); /* t0 = low 128 bits of the product */\
+ t1 = XOR(t1, t3); /* t1 = high 128 bits of the product */\
+
+#define MULDXS(ma, mb) /* carry-less product of the 128-bit ma by the low 64 bits of mb: result in t1:t0, clobbers the caller's t0..t2 */ \
+ t0 = _mm_clmulepi64_si128(ma, mb, 0x00); /* low(ma) x low(mb) */\
+ t2 = _mm_clmulepi64_si128(ma, mb, 0x01); /* high(ma) x low(mb) */\
+ t1 = SHR8(t2, 8); /* upper 64 bits of the second partial product */\
+ t2 = SHL8(t2, 8); /* its lower 64 bits, moved to the upper lane */\
+ t0 = XOR(t0, t2); /* t0 = low 128 bits of the result */\
+
+#define MULSXD(ma, mb) /* MULDXS with the operands swapped: here the 64-bit factor is ma and the 128-bit one is mb */ \
+ MULDXS(mb, ma)
+
+#define RED251(t,m1,m0) /* fold the 128-bit t, sitting 256 bits above m0, into m1:m0 as t * (z^12 + z^9 + z^7 + z^5); clobbers the caller's t0, t1 */\
+ t0 = _mm_slli_si128(t,8); /* low 64 bits of t, placed in the upper lane */\
+ t1 = _mm_srli_si128(t,8); /* high 64 bits of t, placed in the lower lane */\
+ m1 = _mm_xor_si128(m1,_mm_srli_epi64(t1,59)); /* bits of high(t) << 5 that spill into m1 */\
+ m1 = _mm_xor_si128(m1,_mm_srli_epi64(t1,57)); /* same for high(t) << 7 */\
+ m1 = _mm_xor_si128(m1,_mm_srli_epi64(t1,55)); /* same for high(t) << 9 */\
+ m1 = _mm_xor_si128(m1,_mm_srli_epi64(t1,52)); /* same for high(t) << 12 */\
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(t0,59)); /* bits of low(t) << 5 that spill into the upper lane of m0 */\
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(t0,57)); /* same for low(t) << 7 */\
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(t0,55)); /* same for low(t) << 9 */\
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(t0,52)); /* same for low(t) << 12 */\
+ t0 = _mm_srli_si128(t0,8); /* low(t) back in the lower lane */\
+ t1 = _mm_slli_si128(t1,8); /* high(t) back in the upper lane */\
+ m0 = _mm_xor_si128(m0,_mm_slli_epi64(t0,5)); /* in-lane part of low(t) << 5 */\
+ m0 = _mm_xor_si128(m0,_mm_slli_epi64(t0,7)); /* in-lane part of low(t) << 7 */\
+ m0 = _mm_xor_si128(m0,_mm_slli_epi64(t0,9)); /* in-lane part of low(t) << 9 */\
+ m0 = _mm_xor_si128(m0,_mm_slli_epi64(t0,12)); /* in-lane part of low(t) << 12 */\
+ m0 = _mm_xor_si128(m0,_mm_slli_epi64(t1,5)); /* in-lane part of high(t) << 5 */\
+ m0 = _mm_xor_si128(m0,_mm_slli_epi64(t1,7)); /* in-lane part of high(t) << 7 */\
+ m0 = _mm_xor_si128(m0,_mm_slli_epi64(t1,9)); /* in-lane part of high(t) << 9 */\
+ m0 = _mm_xor_si128(m0,_mm_slli_epi64(t1,12)); /* in-lane part of high(t) << 12 */
+
+#define REDUCE() /* reduce the 512-bit m3:m2:m1:m0 into m1:m0 modulo z^251 + z^7 + z^4 + z^2 + 1; uses the caller's m8, m9, t0, t1 and leaves the bits of m1 at z^251 and above for the caller to clear */ \
+ RED251(m3,m2,m1); /* fold m3 into m2:m1 */ \
+ RED251(m2,m1,m0); /* fold m2 into m1:m0 */ \
+ m8 = _mm_srli_si128(m1,8); /* upper 64 bits of m1, placed in the lower lane */ \
+ m9 = _mm_srli_epi64(m8,59); \
+ m9 = _mm_slli_epi64(m9,59); /* keep only its top five bits, i.e. the coefficients of z^251 and above */ \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,59)); /* z^251 -> 1 */ \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,57)); /* z^251 -> z^2 */ \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,55)); /* z^251 -> z^4 */ \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,52)); /* z^251 -> z^7 */ \
+
diff --git a/src/low/curve2251-sse/relic_bn_div_low.c b/src/low/curve2251-sse/relic_bn_div_low.c
new file mode 100644
index 000000000..44aac9c2a
--- /dev/null
+++ b/src/low/curve2251-sse/relic_bn_div_low.c
@@ -0,0 +1,47 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level multiple precision division functions.
+ *
+ * @version $Id: relic_bn_div_low.c 677 2011-03-05 22:19:43Z dfaranha $
+ * @ingroup bn
+ */
+
+#include
+
+#include "relic_bn.h"
+#include "relic_bn_low.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void bn_divn_low(dig_t *c, dig_t *d, dig_t *a, int sa, dig_t *b, int sb) { /* Divides the sa-digit a by the sb-digit b with GMP: quotient to c (sa - sb + 1 digits), remainder to d (sb digits). */
+ mpn_tdiv_qr(c, d, 0, a, sa, b, sb); /* GMP contract: the third argument must be 0, sa >= sb, and the top digit of b must be non-zero */
+}
+
+void bn_div1_low(dig_t *c, dig_t *d, dig_t *a, int size, dig_t b) { /* Divides the size-digit a by the single digit b with GMP: quotient to c, remainder to *d. */
+ *d = mpn_divrem_1(c, 0, a, size, b); /* second argument 0: no fraction digits are requested */
+}
diff --git a/src/low/curve2251-sse/relic_bn_mod_low.c b/src/low/curve2251-sse/relic_bn_mod_low.c
new file mode 100644
index 000000000..5b4b4025b
--- /dev/null
+++ b/src/low/curve2251-sse/relic_bn_mod_low.c
@@ -0,0 +1,62 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level multiple precision integer modular reduction
+ * functions.
+ *
+ * @version $Id: relic_bn_mod_low.c 677 2011-03-05 22:19:43Z dfaranha $
+ * @ingroup bn
+ */
+
+#include
+#include
+
+#include "relic_bn.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void bn_modn_low(dig_t *c, dig_t *a, int sa, dig_t *m, int sm, dig_t u) {
+	/* Digit-by-digit Montgomery-style reduction of a by m, carried out in c. */
+	/* u is presumably -1/m modulo the digit base -- confirm against callers. */
+	/* Digits c[0 .. 2 * sm] are touched; those past sa are used as found. */
+
+	/* Work on a copy of the input. */
+	for (int k = 0; k < sa; k++) {
+		c[k] = a[k];
+	}
+
+	for (int k = 0; k < sm; k++) {
+		dig_t *row = c + k;
+		/* Multiplier for m, derived from the current lowest digit and u. */
+		dig_t q = (dig_t)(*row * u);
+		dig_t cy = mpn_addmul_1(row, m, sm, q);
+		mpn_add_1(row + sm, row + sm, sm - k + 1, cy);
+	}
+	bn_rshd_low(c, c, 2 * sm + 1, sm);
+}
diff --git a/src/low/curve2251-sse/relic_bn_mul_low.c b/src/low/curve2251-sse/relic_bn_mul_low.c
new file mode 100644
index 000000000..502ad438b
--- /dev/null
+++ b/src/low/curve2251-sse/relic_bn_mul_low.c
@@ -0,0 +1,60 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the multiple precision integer arithmetic multiplication
+ * functions.
+ *
+ * @version $Id: relic_bn_mul_low.c 976 2012-01-07 02:21:45Z dfaranha $
+ * @ingroup bn
+ */
+
+#include
+
+#include "relic_bn.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+dig_t bn_muladd_low(dig_t *c, dig_t *a, dig_t digit, int size) { /* Adds a[0..size) * digit into c[0..size) with GMP and returns the carry-out digit. */
+ return mpn_addmul_1(c, a, size, digit);
+}
+
+dig_t bn_mul1_low(dig_t *c, dig_t *a, dig_t digit, int size) { /* Writes the low size digits of a[0..size) * digit to c with GMP and returns the most significant digit. */
+ return mpn_mul_1(c, a, size, digit);
+}
+
+void bn_muln_low(dig_t *c, dig_t *a, dig_t *b, int size) { /* Multiplies two size-digit operands with GMP; the 2 * size-digit product goes to c. */
+ mpn_mul_n(c, a, b, size);
+}
+
+void bn_muld_low(dig_t *c, dig_t *a, int sizea, dig_t *b, int sizeb, /* Multiplies operands of different lengths with GMP; the sizea + sizeb-digit product goes to c. */
+ int low, int high) {
+ (void) low; /* the requested digit range is ignored: the full product is always computed */
+ (void) high;
+ mpn_mul(c, a, sizea, b, sizeb); /* GMP contract: sizea >= sizeb */
+}
diff --git a/src/low/curve2251-sse/relic_bn_sqr_low.c b/src/low/curve2251-sse/relic_bn_sqr_low.c
new file mode 100644
index 000000000..c35bddee8
--- /dev/null
+++ b/src/low/curve2251-sse/relic_bn_sqr_low.c
@@ -0,0 +1,59 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the multiple precision integer arithmetic squaring
+ * functions.
+ *
+ * @version $Id: relic_bn_sqr_low.c 677 2011-03-05 22:19:43Z dfaranha $
+ * @ingroup bn
+ */
+
+#include
+
+#include "relic_bn.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void bn_sqradd_low(dig_t *c, dig_t *a, int size) {
+	/* Adds a[0] * a[0..size) at c, then a[0] * a[1..size) at c + 1. */
+	const dig_t lead = a[0];
+	dig_t *upper = c + size;
+	dig_t cy = mpn_addmul_1(c, a, size, lead);
+
+	/* Each carry-out is folded into the size digits starting at c[size]. */
+	mpn_add_1(upper, upper, size, cy);
+	if (size > 1) {
+		cy = mpn_addmul_1(c + 1, a + 1, size - 1, lead);
+		mpn_add_1(upper, upper, size, cy);
+	}
+}
+
+void bn_sqrn_low(dig_t *c, dig_t *a, int size) { /* Squares the size-digit a with GMP by multiplying it by itself; the 2 * size-digit result goes to c. */
+ mpn_mul_n(c, a, a, size);
+}
diff --git a/src/low/curve2251-sse/relic_fb_add_low.c b/src/low/curve2251-sse/relic_fb_add_low.c
new file mode 100755
index 000000000..941a3f5df
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_add_low.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field addition and subtraction
+ * functions.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include
+#include
+#ifdef __PCLMUL__
+#include
+#endif
+
+#include
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+
+#include "macros.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void fb_add1_low(dig_t *c, dig_t *a, dig_t digit) {
+	/* Binary-field addition is XOR, so only the lowest digit changes. */
+	c[0] = a[0] ^ digit;
+
+	/* The remaining FB_DIGS - 1 digits are copied through unchanged. */
+	for (int k = 1; k < FB_DIGS; k++) {
+		c[k] = a[k];
+	}
+}
+
+void fb_addn_low(dig_t *c, dig_t *a, dig_t *b) { /* c = a ^ b on digits 0-1 and 2-3, each pair handled as one aligned 128-bit value */
+	_mm_store_si128((__m128i *)c, _mm_xor_si128(_mm_load_si128((__m128i *)a), _mm_load_si128((__m128i *)b)));
+	_mm_store_si128((__m128i *)(c + 2), _mm_xor_si128(_mm_load_si128((__m128i *)(a + 2)), _mm_load_si128((__m128i *)(b + 2))));
+}
+
+void fb_addd_low(dig_t *c, dig_t *a, dig_t *b, int size) { /* c = a ^ b over size digits; any other size than 2 * FB_DIGS goes digit by digit */
+	if (size != 2 * FB_DIGS) {
+		for (int k = 0; k < size; k++) {
+			c[k] = a[k] ^ b[k];
+		}
+		return;
+	}
+	/* Double-length operands: eight digits handled as four aligned 128-bit XORs. */
+	for (int k = 0; k < 8; k += 2) {
+		_mm_store_si128((__m128i *)(c + k), _mm_xor_si128(_mm_load_si128((__m128i *)(a + k)), _mm_load_si128((__m128i *)(b + k))));
+	}
+}
diff --git a/src/low/curve2251-sse/relic_fb_inv_low.c b/src/low/curve2251-sse/relic_fb_inv_low.c
new file mode 100755
index 000000000..70fd73aed
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_inv_low.c
@@ -0,0 +1,100 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level inversion functions.
+ *
+ * @version $Id: relic_fb_inv_low.c 553 2010-11-01 22:59:38Z dfaranha $
+ * @ingroup fb
+ */
+
+#include
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void fb_invn_low(dig_t *c, dig_t *a) {
+	/* Inversion driven by an addition chain on exponents (Itoh-Tsujii style): */
+	/* table[i] = a^(2^u[i] - 1), and c = table[len]^2 = a^(2^(u[len] + 1) - 2). */
+	/* The fixed prefix below fills entries 0..5, so it assumes len >= 5. */
+	int i, j, x, y, len;
+	int *chain = fb_poly_get_chain(&len);
+	int u[len + 1];
+	fb_t table[len + 1];
+	for (i = 0; i <= len; i++) {
+		fb_null(table[i]);
+	}
+	for (i = 0; i <= len; i++) {
+		fb_new(table[i]);
+	}
+
+	u[0] = 1;
+	u[1] = 2;
+	fb_copy(table[0], a);
+	fb_sqr(table[1], table[0]);
+	fb_mul(table[1], table[1], table[0]);
+
+	u[2] = u[1] + u[0];
+	fb_sqr(table[2], table[1]);
+	fb_mul(table[2], table[2], table[0]);
+
+	u[3] = u[2] + u[1];
+	fb_sqr(table[3], table[2]);
+	for (j = 1; j < u[1]; j++) {
+		fb_sqr(table[3], table[3]);
+	}
+	fb_mul(table[3], table[3], table[1]);
+
+	u[4] = 2 * u[3];
+	fb_sqr(table[4], table[3]);
+	for (j = 1; j < u[3]; j++) {
+		fb_sqr(table[4], table[4]);
+	}
+	fb_mul(table[4], table[4], table[3]);
+
+	u[5] = u[4] + u[3];
+	fb_sqr(table[5], table[4]);
+	for (j = 1; j < u[3]; j++) {
+		fb_sqr(table[5], table[5]);
+	}
+	fb_mul(table[5], table[5], table[3]);
+
+	for (i = 6; i <= len; i++) {
+		/* Each chain entry packs the two operand indices as (x << 8) + y. */
+		x = chain[i - 1] >> 8;
+		y = chain[i - 1] - (x << 8);
+		u[i] = (x == y) ? 2 * u[i - 1] : u[x] + u[y];
+		dig_t *tab = (dig_t *)fb_poly_tab_sqr(y);
+		fb_itr(table[i], table[x], u[y], (void *)tab);
+		fb_mul(table[i], table[i], table[y]);
+	}
+	fb_sqr(c, table[len]);
+	/* Release the temporaries; needed whenever fb_new allocates storage. */
+	for (i = 0; i <= len; i++) {
+		fb_free(table[i]);
+	}
+}
diff --git a/src/low/curve2251-sse/relic_fb_itr_low.c b/src/low/curve2251-sse/relic_fb_itr_low.c
new file mode 100644
index 000000000..33737d6e0
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_itr_low.c
@@ -0,0 +1,69 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level iterated squaring/square-root.
+ *
+ * @version $Id: relic_fb_sqr_low.c 677 2011-03-05 22:19:43Z dfaranha $
+ * @ingroup fb
+ */
+
+#include
+#include
+
+#include "relic_fb.h"
+#include "relic_dv.h"
+#include "relic_fb_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void fb_itrn_low(dig_t *c, dig_t *a, dig_t *t) {
+	/* Table-driven map: XORs together one precomputed FB_DIGS-digit row of t */
+	/* per 4-bit window of a; a row is selected by (bit offset * 4 + nibble). */
+	const int last = FB_DIGS - 1;
+	__m128i lo = _mm_setzero_si128();
+	__m128i hi = _mm_setzero_si128();
+	/* Every window of all digits except the most significant one. */
+	for (int sh = FB_DIGIT - 4; sh >= 0; sh -= 4) {
+		for (int d = 0; d < last; d++) {
+			dig_t nib = (a[d] >> sh) & 0x0F;
+			dig_t *row = t + ((d * FB_DIGIT + sh) * 4 + nib) * FB_DIGS;
+			lo = _mm_xor_si128(lo, *(__m128i *)row);
+			hi = _mm_xor_si128(hi, *(__m128i *)(row + 2));
+		}
+	}
+	/* Most significant digit: its topmost window is skipped. */
+	for (int sh = FB_DIGIT - 8; sh >= 0; sh -= 4) {
+		dig_t nib = (a[last] >> sh) & 0x0F;
+		dig_t *row = t + ((last * FB_DIGIT + sh) * 4 + nib) * FB_DIGS;
+		lo = _mm_xor_si128(lo, *(__m128i *)row);
+		hi = _mm_xor_si128(hi, *(__m128i *)(row + 2));
+	}
+
+	_mm_store_si128((__m128i *)c, lo);
+	_mm_store_si128((__m128i *)(c + 2), hi);
+}
diff --git a/src/low/curve2251-sse/relic_fb_mul_low.c b/src/low/curve2251-sse/relic_fb_mul_low.c
new file mode 100755
index 000000000..f91dfdba7
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_mul_low.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field bit multiplication functions.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include
+#include
+
+#include
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+#include "macros.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void fb_muld_low(dig_t *c, dig_t *a, dig_t *b, int size) { /* c (2 * size digits) = a * b as binary polynomials, using a table of the 16 multiples of b by 4-bit polynomials */
+ dv_t table[16];
+ dig_t u, *tmpa, *tmpc, r0, r1, r2, r4, r8;
+ int i, j;
+
+ dv_zero(c, 2 * size);
+
+ for (i = 0; i < 16; i++) {
+ dv_new(table[i]);
+ dv_zero(table[i], size + 1); /* one extra digit for the bits shifted out of the top of b */
+ }
+
+ u = 0; /* previous digit of b, supplying the bits shifted in from below */
+ for (i = 0; i < size; i++) {
+ r1 = r0 = b[i];
+ r2 = (r0 << 1) | (u >> (FB_DIGIT - 1)); /* digit i of b * z */
+ r4 = (r0 << 2) | (u >> (FB_DIGIT - 2)); /* digit i of b * z^2 */
+ r8 = (r0 << 3) | (u >> (FB_DIGIT - 3)); /* digit i of b * z^3 */
+ table[0][i] = 0; /* table[k] = b * k(z), k read as a polynomial of degree < 4 */
+ table[1][i] = r1;
+ table[2][i] = r2;
+ table[3][i] = r1 ^ r2;
+ table[4][i] = r4;
+ table[5][i] = r1 ^ r4;
+ table[6][i] = r2 ^ r4;
+ table[7][i] = r1 ^ r2 ^ r4;
+ table[8][i] = r8;
+ table[9][i] = r1 ^ r8;
+ table[10][i] = r2 ^ r8;
+ table[11][i] = r1 ^ r2 ^ r8;
+ table[12][i] = r4 ^ r8;
+ table[13][i] = r1 ^ r4 ^ r8;
+ table[14][i] = r2 ^ r4 ^ r8;
+ table[15][i] = r1 ^ r2 ^ r4 ^ r8;
+ u = r1;
+ }
+
+ if (u > 0) { /* top digit of b is non-zero: fill the extra digit with its shifted-out bits (otherwise it stays zero) */
+ r2 = u >> (FB_DIGIT - 1);
+ r4 = u >> (FB_DIGIT - 2);
+ r8 = u >> (FB_DIGIT - 3);
+ table[0][size] = table[1][size] = 0;
+ table[2][size] = table[3][size] = r2;
+ table[4][size] = table[5][size] = r4;
+ table[6][size] = table[7][size] = r2 ^ r4;
+ table[8][size] = table[9][size] = r8;
+ table[10][size] = table[11][size] = r2 ^ r8;
+ table[12][size] = table[13][size] = r4 ^ r8;
+ table[14][size] = table[15][size] = r2 ^ r4 ^ r8;
+ }
+
+ for (i = FB_DIGIT - 4; i > 0; i -= 4) { /* windows of a from the top one down to the second-lowest */
+ tmpa = a;
+ tmpc = c;
+ for (j = 0; j < size; j++, tmpa++, tmpc++) {
+ u = (*tmpa >> i) & 0x0F; /* window i of digit j of a */
+ fb_addd_low(tmpc, tmpc, table[u], size + 1); /* add the matching multiple of b at digit offset j */
+ }
+ bn_lshb_low(c, c, 2 * size, 4); /* shift the accumulator up by one window */
+ }
+ for (j = 0; j < size; j++, a++, c++) { /* lowest window: no shift afterwards; note that a and c themselves are advanced */
+ u = *a & 0x0F;
+ fb_addd_low(c, c, table[u], size + 1);
+ }
+ for (i = 0; i < 16; i++) {
+ dv_free(table[i]);
+ }
+}
+
+#if defined(__PCLMUL__) || defined(__INTEL_COMPILER)
+#include "relic_fb_mul_low_cl.c"
+#else
+#ifndef SHUFFLE
+#include "relic_fb_mul_low_ld.c"
+#else
+#include "relic_fb_mul_low_sf.c"
+#endif
+#endif
diff --git a/src/low/curve2251-sse/relic_fb_mul_low_cl.c b/src/low/curve2251-sse/relic_fb_mul_low_cl.c
new file mode 100755
index 000000000..69c802233
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_mul_low_cl.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field bit multiplication functions.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+#include "macros.h"
+
+#include
+#include
+#include
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void fb_muln_low(dig_t *c, dig_t *a, dig_t *b) {
+	/* Carry-less product of two 256-bit operands by one Karatsuba level over */
+	/* 128-bit halves. The 512-bit result goes to c as four 128-bit words; */
+	/* a, b and c are accessed with aligned 128-bit loads and stores. */
+	__m128i t0, t1, t2, t3; /* scratch variables that MUL() refers to by name */
+	__m128i a_lo, a_hi, b_lo, b_hi, lo0, lo1, hi0, hi1, mid0, mid1;
+	__m128i *dst = (__m128i *)c;
+
+	a_lo = _mm_load_si128((__m128i *)a);
+	b_lo = _mm_load_si128((__m128i *)b);
+	a_hi = _mm_load_si128((__m128i *)a + 1);
+	b_hi = _mm_load_si128((__m128i *)b + 1);
+
+	/* Product of the low halves. */
+	MUL(a_lo, b_lo);
+	lo0 = t0;
+	lo1 = t1;
+
+	/* Product of the high halves. */
+	MUL(a_hi, b_hi);
+	hi0 = t0;
+	hi1 = t1;
+
+	/* Product of the half sums, with both outer products XORed away. */
+	a_lo = XOR(a_lo, a_hi);
+	b_lo = XOR(b_lo, b_hi);
+	MUL(a_lo, b_lo);
+	mid0 = XOR(XOR(t0, lo0), hi0);
+	mid1 = XOR(XOR(t1, lo1), hi1);
+
+	/* The middle term lands on words 1 and 2 of the result. */
+	_mm_store_si128(dst + 0, lo0);
+	_mm_store_si128(dst + 1, XOR(lo1, mid0));
+	_mm_store_si128(dst + 2, XOR(hi0, mid1));
+	_mm_store_si128(dst + 3, hi1);
+
+}
+
+#if !defined(__INTEL_COMPILER)
+
+void fb_mulm_low(dig_t *c, dig_t *a, dig_t *b) { /* c = a * b reduced modulo z^251 + z^7 + z^4 + z^2 + 1: Karatsuba product into t, then a digit-wise scalar reduction */
+ __m128i ma0, ma1, mb0, mb1, m0, m1, m2, m3, m4, m5, t0, t1, t2, t3;
+ align dig_t t[2*FB_DIGS]; /* holds the unreduced double-length product */
+
+ ma0 = _mm_load_si128((__m128i *)a);
+ mb0 = _mm_load_si128((__m128i *)b);
+
+ MUL(ma0, mb0); /* product of the low 128-bit halves */
+ m0 = t0;
+ m1 = t1;
+
+ ma1 = _mm_load_si128((__m128i *)a + 1);
+ mb1 = _mm_load_si128((__m128i *)b + 1);
+ MUL(ma1, mb1); /* product of the high 128-bit halves */
+ m2 = t0;
+ m3 = t1;
+
+ ma0 = XOR(ma0, ma1);
+ mb0 = XOR(mb0, mb1);
+
+ MUL(ma0, mb0); /* product of the half sums */
+ m4 = t0;
+ m5 = t1;
+
+ m4 = _mm_xor_si128(m4, m0); /* m5:m4 becomes the Karatsuba middle term */
+ m5 = _mm_xor_si128(m5, m1);
+ m4 = _mm_xor_si128(m4, m2);
+ m5 = _mm_xor_si128(m5, m3);
+
+ m1 = XOR(m1, m4); /* the middle term overlaps words 1 and 2 of the product */
+ m2 = XOR(m2, m5);
+
+ _mm_store_si128((__m128i *) t + 0, m0);
+ _mm_store_si128((__m128i *) t + 1, m1);
+ _mm_store_si128((__m128i *) t + 2, m2);
+ _mm_store_si128((__m128i *) t + 3, m3);
+
+ const int ra = 52; /* 64 - la */
+ const int rb = 55; /* 64 - lb */
+ const int rc = 57; /* 64 - lc */
+ const int rh = 59; /* 64 - lh */
+ const int lh = 5; /* a digit four places up is multiplied by z^5 + z^7 + z^9 + z^12 when folded down */
+ const int la = 12;
+ const int lb = 9;
+ const int lc = 7;
+
+ dig_t d = t[7], a0 = t[0], a1 = t[1], a2 = t[2], a3 = t[3], a4 = t[4];
+
+ a4 ^= (d >> rh); /* fold t[7]: the parts spilling over go to digit 4 */
+ a4 ^= (d >> ra);
+ a4 ^= (d >> rb);
+ a4 ^= (d >> rc);
+
+ a3 ^= (d << lh); /* fold t[7]: the in-digit parts go to digit 3 */
+ a3 ^= (d << la);
+ a3 ^= (d << lb);
+ a3 ^= (d << lc);
+
+ d = t[6]; /* fold t[6] into digits 3 and 2 */
+ a3 ^= (d >> rh);
+ a3 ^= (d >> ra);
+ a3 ^= (d >> rb);
+ a3 ^= (d >> rc);
+
+ a2 ^= (d << lh);
+ a2 ^= (d << la);
+ a2 ^= (d << lb);
+ a2 ^= (d << lc);
+
+ d = t[5]; /* fold t[5] into digits 2 and 1 */
+ a2 ^= (d >> rh);
+ a2 ^= (d >> ra);
+ a2 ^= (d >> rb);
+ a2 ^= (d >> rc);
+
+ a1 ^= (d << lh);
+ a1 ^= (d << la);
+ a1 ^= (d << lb);
+ a1 ^= (d << lc);
+
+ d = a4; /* fold digit 4 (t[4] plus what t[7] added to it) into digits 1 and 0 */
+ a1 ^= (d >> rh);
+ a1 ^= (d >> ra);
+ a1 ^= (d >> rb);
+ a1 ^= (d >> rc);
+
+ a0 ^= (d << lh);
+ a0 ^= (d << la);
+ a0 ^= (d << lb);
+ a0 ^= (d << lc);
+
+ d = a3 >> rh; /* top five bits of digit 3: the coefficients of z^251 and above */
+ a0 ^= d; /* z^251 -> 1 */
+ d <<= rh; /* put those bits back at their original position */
+
+ a0 ^= (d >> ra); /* z^251 -> z^7 */
+ a0 ^= (d >> rb); /* z^251 -> z^4 */
+ a0 ^= (d >> rc); /* z^251 -> z^2 */
+ a3 ^= d; /* clear the folded top bits of digit 3 */
+
+ c[3] = a3;
+ c[2] = a2;
+ c[1] = a1;
+ c[0] = a0;
+
+ return;
+}
+
+#else
+
+void fb_mulm_low(dig_t *c, dig_t *a, dig_t *b) { /* c = a * b reduced with REDUCE(): Karatsuba product kept in m3:m2:m1:m0, reduction done in SSE registers */
+ __m128i ma0, ma1, mb0, mb1, m0, m1, m2, m3, m4, m5, m8, m9, t0, t1, t2, t3; /* m0..m3, m8, m9 and t0..t3 are referred to by name inside MUL() and REDUCE() */
+
+ ma0 = _mm_load_si128((__m128i *)a);
+ mb0 = _mm_load_si128((__m128i *)b);
+
+ MUL(ma0, mb0); /* product of the low 128-bit halves */
+ m0 = t0;
+ m1 = t1;
+
+ ma1 = _mm_load_si128((__m128i *)a + 1);
+ mb1 = _mm_load_si128((__m128i *)b + 1);
+ MUL(ma1, mb1); /* product of the high 128-bit halves */
+ m2 = t0;
+ m3 = t1;
+
+ ma0 = XOR(ma0, ma1);
+ mb0 = XOR(mb0, mb1);
+
+ MUL(ma0, mb0); /* product of the half sums */
+ m4 = t0;
+ m5 = t1;
+
+ m4 = _mm_xor_si128(m4, m0); /* m5:m4 becomes the Karatsuba middle term */
+ m5 = _mm_xor_si128(m5, m1);
+ m4 = _mm_xor_si128(m4, m2);
+ m5 = _mm_xor_si128(m5, m3);
+
+ m1 = XOR(m1, m4); /* the middle term overlaps words 1 and 2 of the product */
+ m2 = XOR(m2, m5);
+
+ align dig_t _x[2]; /* aligned staging area for the upper word, so that its top digit can be masked */
+
+ REDUCE();
+ _mm_store_si128((__m128i *) c + 0, m0);
+ _mm_store_si128((__m128i *) _x, m1);
+ c[2] = _x[0];
+ c[3] = _x[1] & 0x07FFFFFFFFFFFFFF; /* keep the low 59 bits: drops the coefficients of z^251 and above that REDUCE() already folded into m0 */
+ return;
+}
+
+#endif
+
+void fb_mul1_low(dig_t *c, dig_t *a, dig_t digit) {
+	/* Carry-less product of the 256-bit a by a single 64-bit digit. */
+	__m128i t0, t1, t2, w0, w1, lo, hi; /* t0..t2 are named by MULDXS() */
+	__m128i *src = (__m128i *)a, *dst = (__m128i *)c;
+	__m128i mult = _mm_set_epi32(0, 0, digit >> 32, digit & 0xFFFFFFFF);
+
+	lo = _mm_load_si128(src);
+	hi = _mm_load_si128(src + 1);
+	MULDXS(lo, mult);
+	w0 = t0;
+	w1 = t1;
+	MULDXS(hi, mult);
+	w1 = XOR(w1, t0);
+
+	/* Three 128-bit stores: 48 bytes of c are written in total. */
+	_mm_store_si128(dst + 0, w0);
+	_mm_store_si128(dst + 1, w1);
+	_mm_store_si128(dst + 2, t1);
+}
diff --git a/src/low/curve2251-sse/relic_fb_mul_low_ld.c b/src/low/curve2251-sse/relic_fb_mul_low_ld.c
new file mode 100755
index 000000000..15420960f
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_mul_low_ld.c
@@ -0,0 +1,483 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see .
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field bit multiplication functions.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+#include "macros.h"
+
+#include
+#include
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void fb_mul1_low(dig_t *c, dig_t *a, dig_t digit) {
+ /* Unreduced product c = a * digit, computed by shift-and-add over the bits of digit; every path defines FB_DIGS + 1 digits of c. */
+ dig_t b1, b2;
+ if (digit == 0) {
+ dv_zero(c, FB_DIGS + 1);
+ return;
+ }
+ if (digit == 1) {
+ fb_copy(c, a);
+ c[FB_DIGS] = 0; /* fb_copy is assumed to write only FB_DIGS digits; the extra digit must be defined like on the other paths */
+ return;
+ }
+ c[FB_DIGS] = fb_lshb_low(c, a, util_bits_dig(digit) - 1); /* start from a shifted by the position of digit's top set bit; the value returned by the shift is stored as the extra digit */
+ for (int i = util_bits_dig(digit) - 2; i > 0; i--) { /* remaining set bits above bit 0 */
+ if (digit & ((dig_t)1 << i)) {
+ int j = FB_DIGIT - i; /* complementary right shift for the bits crossing into the next digit */
+ b1 = a[0];
+ c[0] ^= (b1 << i);
+ for (int k = 1; k < FB_DIGS; k++) {
+ b2 = a[k];
+ c[k] ^= ((b2 << i) | (b1 >> j));
+ b1 = b2;
+ }
+ c[FB_DIGS] ^= (b1 >> j); /* bits of the top digit of a that spill into the extra digit */
+ }
+ }
+ if (digit & (dig_t)1) { /* bit 0 contributes a itself */
+ fb_add(c, c, a);
+ }
+}
+
+void fb_muln_low(dig_t *c, dig_t *a, dig_t *b) { /* c = a * b without reduction: four 128-bit words are stored to c */
+ __m128i tab[16][8], tab1[16][8]; /* rows are indexed by a 4-bit value; only columns 0..1 of tab and 0..2 of tab1 are used below */
+ __m128i s0, m0, m1, m2, m3, m4, m8, m9;
+ char ta, tb; /* receive (truncated) shifted digits of a; always masked with 0xf before indexing */
+ int i, j, k;
+ /* LOOKUP(i, T) fills column i of T with the XOR of the mX selected by the bits of the row index (bit 0 -> m0, bit 1 -> m1, bit 2 -> m2, bit 3 -> m3); it clobbers m2 and m9. */
+#define LOOKUP(i, T)\
+ T[0][i] = _mm_setzero_si128();\
+ T[1][i] = m0;\
+ T[2][i] = m1;\
+ T[3][i] = m9=_mm_xor_si128(m0,m1);\
+ T[4][i] = m2;\
+ T[5][i] = _mm_xor_si128(m2,m0);\
+ T[6][i] = _mm_xor_si128(m2,m1);\
+ T[7][i] = _mm_xor_si128(m2,m9);\
+ T[8][i] = m3;\
+ T[9][i] = _mm_xor_si128(m3,m0);\
+ T[10][i] = _mm_xor_si128(m3,m1);\
+ T[11][i] = _mm_xor_si128(m3,m9);\
+ T[12][i] = m2=_mm_xor_si128(m3,m2);\
+ T[13][i] = _mm_xor_si128(m2,m0);\
+ T[14][i] = _mm_xor_si128(m2,m1);\
+ T[15][i] = _mm_xor_si128(m2,m9);
+ /* Precomputation: for each 128-bit word of b, build b shifted left by 0..3 bits (tab) and by 4..7 bits (tab1). */
+ s0 = _mm_setzero_si128(); /* bits carried in from the previous word of b (none at the start) */
+ for (i = 0; i < 2; i++) {
+ m0 = _mm_load_si128((__m128i *) (b + 2 * i)); /* word i of b; aligned load, b must be 16-byte aligned */
+ m9 = _mm_srli_epi64(m0, 57); /* top 7 bits of each 64-bit lane */
+ m8 = _mm_slli_si128(m9, 8); /* the low lane's top bits feed the high lane */
+ m8 = _mm_xor_si128(m8, s0); /* the low lane is fed by the previous word's carry */
+ s0 = _mm_srli_si128(m9, 8); /* the high lane's top bits become the carry into the next word */
+ m1 = _mm_slli_epi64(m0, 1);
+ m2 = _mm_slli_epi64(m0, 2);
+ m3 = _mm_slli_epi64(m0, 3);
+ m3 = _mm_xor_si128(m3, _mm_srli_epi64(m8, 4)); /* m3 = word i of b << 3 */
+ m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 5)); /* m2 = word i of b << 2 */
+ m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 6)); /* m1 = word i of b << 1 */
+ LOOKUP(i, tab); /* tab[u][i] = word i of u(z) * b(z) for the 4-bit polynomial u */
+ m4 = m0; /* LOOKUP does not modify m0, so m4 is word i of b */
+ m0 = _mm_slli_epi64(m4, 4);
+ m1 = _mm_slli_epi64(m4, 5);
+ m2 = _mm_slli_epi64(m4, 6);
+ m3 = _mm_slli_epi64(m4, 7);
+ m3 = _mm_xor_si128(m3, m8); /* m3 = word i of b << 7 */
+ m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 1)); /* m2 = word i of b << 6 */
+ m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 2)); /* m1 = word i of b << 5 */
+ m0 = _mm_xor_si128(m0, _mm_srli_epi64(m8, 3)); /* m0 = word i of b << 4 */
+ LOOKUP(i, tab1); /* tab1[u][i] = word i of u(z) * z^4 * b(z) */
+ }
+ m3 = s0; /* i == 2 here: the bits of b << 7 that spill past 256 bits */
+ m2 = _mm_srli_epi64(s0, 1); /* spill of b << 6 */
+ tab1[0][i] = tab1[1][i] = tab1[2][i] = tab1[3][i] = _mm_setzero_si128(); /* rows 0..3 involve only b << 4 and b << 5, whose spill is taken as zero (true when b[3] < 2^59 -- confirm callers pass reduced operands) */
+ tab1[4][i] = tab1[5][i] = tab1[6][i] = tab1[7][i] = m2;
+ tab1[8][i] = tab1[9][i] = tab1[10][i] = tab1[11][i] = m3;
+ m2 =_mm_xor_si128(m3,m2);
+ tab1[12][i] = tab1[13][i] = tab1[14][i] = tab1[15][i] = m2;
+#undef LOOKUP
+ /* LSHIFT8 shifts the 512-bit value m3:m2:m1:m0 left by one byte. It is not #undef'd in this function. */
+#define LSHIFT8(m3,m2,m1,m0)\
+ m3=_mm_alignr_epi8(m3,m2,15);\
+ m2=_mm_alignr_epi8(m2,m1,15);\
+ m1=_mm_alignr_epi8(m1,m0,15);\
+ m0=_mm_slli_si128(m0,1);
+ /* M adds tab[ta & 0xf] (two words) and tab1[tb & 0xf] (three words) into the accumulator words m0, m1, m2. */
+#define M(m2,m1,m0,ta,tb)\
+ m0=_mm_xor_si128(m0, tab[ta&0xf][0]);\
+ m1=_mm_xor_si128(m1, tab[ta&0xf][1]);\
+ m0=_mm_xor_si128(m0,tab1[tb&0xf][0]);\
+ m1=_mm_xor_si128(m1,tab1[tb&0xf][1]);\
+ m2=_mm_xor_si128(m2,tab1[tb&0xf][2]);\
+
+ /* Main computation: consume a one byte at a time from the most significant byte, low nibble through tab and high nibble through tab1. */
+ m0 = m1 = m2 = m3 = _mm_setzero_si128();
+ /* Odd digits first: a[1] accumulates at word 0 and a[3] one word (128 bits) higher. */
+ for (j = 56; j >= 0; j -= 8) {
+ k = j + 4;
+ ta = (a[1] >> j);
+ tb = (a[1] >> k);
+ M(m2, m1, m0, ta, tb);
+ ta = (a[3] >> j);
+ tb = (a[3] >> k);
+ M(m3, m2, m1, ta, tb);
+ LSHIFT8(m3, m2, m1, m0); /* make room for the next lower byte */
+ }
+ for (j = 56; j >= 8; j -= 8) { /* even digits a[0] and a[2], all bytes except the lowest */
+ k = j + 4;
+ ta = (a[0] >> j);
+ tb = (a[0] >> k);
+ M(m2, m1, m0, ta, tb);
+ ta = (a[2] >> j);
+ tb = (a[2] >> k);
+ M(m3, m2, m1, ta, tb);
+ LSHIFT8(m3, m2, m1, m0);
+ }
+ ta = a[0]; /* lowest byte of a[0] and a[2]: no shift follows */
+ tb = (a[0] >> 4);
+ M(m2, m1, m0, ta, tb);
+ ta = a[2];
+ tb = (a[2] >> 4);
+ M(m3, m2, m1, ta, tb);
+ /* Aligned stores: c must be 16-byte aligned and hold eight digits. */
+ _mm_store_si128((__m128i *) c + 0, m0);
+ _mm_store_si128((__m128i *) c + 1, m1);
+ _mm_store_si128((__m128i *) c + 2, m2);
+ _mm_store_si128((__m128i *) c + 3, m3);
+#undef M
+}
+
+#if !defined(__INTEL_COMPILER)
+
+void fb_mulm_low(dig_t *c, dig_t *a, dig_t *b) { /* c = a * b reduced to four digits: table-based multiplication into t, followed by an inline shift-and-XOR reduction */
+ __m128i tab[16][8], tab1[16][8]; /* rows are indexed by a 4-bit value; only columns 0..1 of tab and 0..2 of tab1 are used below */
+ __m128i s0, m0, m1, m2, m3, m4, m8, m9;
+ align dig_t t[2*FB_DIGS]; /* unreduced product; four 128-bit stores land here, so 2*FB_DIGS must be at least 8 */
+ char ta, tb; /* receive (truncated) shifted digits of a; always masked with 0xf before indexing */
+ int i, j, k;
+ /* LOOKUP(i, T) fills column i of T with the XOR of the mX selected by the bits of the row index (bit 0 -> m0, bit 1 -> m1, bit 2 -> m2, bit 3 -> m3); it clobbers m2 and m9. */
+#define LOOKUP(i, T)\
+ T[0][i] = _mm_setzero_si128();\
+ T[1][i] = m0;\
+ T[2][i] = m1;\
+ T[3][i] = m9=_mm_xor_si128(m0,m1);\
+ T[4][i] = m2;\
+ T[5][i] = _mm_xor_si128(m2,m0);\
+ T[6][i] = _mm_xor_si128(m2,m1);\
+ T[7][i] = _mm_xor_si128(m2,m9);\
+ T[8][i] = m3;\
+ T[9][i] = _mm_xor_si128(m3,m0);\
+ T[10][i] = _mm_xor_si128(m3,m1);\
+ T[11][i] = _mm_xor_si128(m3,m9);\
+ T[12][i] = m2=_mm_xor_si128(m3,m2);\
+ T[13][i] = _mm_xor_si128(m2,m0);\
+ T[14][i] = _mm_xor_si128(m2,m1);\
+ T[15][i] = _mm_xor_si128(m2,m9);
+ /* Precomputation: for each 128-bit word of b, build b shifted left by 0..3 bits (tab) and by 4..7 bits (tab1). */
+ s0 = _mm_setzero_si128(); /* bits carried in from the previous word of b (none at the start) */
+ for (i = 0; i < 2; i++) {
+ m0 = _mm_load_si128((__m128i *) (b + 2 * i)); /* word i of b; aligned load, b must be 16-byte aligned */
+ m9 = _mm_srli_epi64(m0, 57); /* top 7 bits of each 64-bit lane */
+ m8 = _mm_slli_si128(m9, 8); /* the low lane's top bits feed the high lane */
+ m8 = _mm_xor_si128(m8, s0); /* the low lane is fed by the previous word's carry */
+ s0 = _mm_srli_si128(m9, 8); /* the high lane's top bits become the carry into the next word */
+ m1 = _mm_slli_epi64(m0, 1);
+ m2 = _mm_slli_epi64(m0, 2);
+ m3 = _mm_slli_epi64(m0, 3);
+ m3 = _mm_xor_si128(m3, _mm_srli_epi64(m8, 4)); /* m3 = word i of b << 3 */
+ m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 5)); /* m2 = word i of b << 2 */
+ m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 6)); /* m1 = word i of b << 1 */
+ LOOKUP(i, tab); /* tab[u][i] = word i of u(z) * b(z) for the 4-bit polynomial u */
+ m4 = m0; /* LOOKUP does not modify m0, so m4 is word i of b */
+ m0 = _mm_slli_epi64(m4, 4);
+ m1 = _mm_slli_epi64(m4, 5);
+ m2 = _mm_slli_epi64(m4, 6);
+ m3 = _mm_slli_epi64(m4, 7);
+ m3 = _mm_xor_si128(m3, m8); /* m3 = word i of b << 7 */
+ m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 1)); /* m2 = word i of b << 6 */
+ m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 2)); /* m1 = word i of b << 5 */
+ m0 = _mm_xor_si128(m0, _mm_srli_epi64(m8, 3)); /* m0 = word i of b << 4 */
+ LOOKUP(i, tab1); /* tab1[u][i] = word i of u(z) * z^4 * b(z) */
+ }
+ m3 = s0; /* i == 2 here: the bits of b << 7 that spill past 256 bits */
+ m2 = _mm_srli_epi64(s0, 1); /* spill of b << 6 */
+ m1 = _mm_setzero_si128(); /* spill of b << 5 taken as zero; this store is dead, m1 is re-zeroed before it is next read */
+ m0 = _mm_setzero_si128(); /* spill of b << 4 taken as zero; likewise a dead store */
+ tab1[0][i] = tab1[1][i] = tab1[2][i] = tab1[3][i] = m9 = _mm_setzero_si128(); /* rows 0..3 involve only b << 4 and b << 5 (zero spill holds when b[3] < 2^59 -- confirm callers pass reduced operands); m9 is not read afterwards */
+ tab1[4][i] = tab1[5][i] = tab1[6][i] = tab1[7][i] = m2;
+ tab1[8][i] = tab1[9][i] = tab1[10][i] = tab1[11][i] = m3;
+ m2 =_mm_xor_si128(m3,m2);
+ tab1[12][i] = tab1[13][i] = tab1[14][i] = tab1[15][i] = m2;
+#undef LOOKUP
+ /* LSHIFT8 shifts the 512-bit value m3:m2:m1:m0 left by one byte. It is not #undef'd in this function. */
+#define LSHIFT8(m3,m2,m1,m0)\
+ m3=_mm_alignr_epi8(m3,m2,15);\
+ m2=_mm_alignr_epi8(m2,m1,15);\
+ m1=_mm_alignr_epi8(m1,m0,15);\
+ m0=_mm_slli_si128(m0,1);
+ /* M adds tab[ta & 0xf] (two words) and tab1[tb & 0xf] (three words) into the accumulator words m0, m1, m2. It is not #undef'd in this function. */
+#define M(m2,m1,m0,ta,tb)\
+ m0=_mm_xor_si128(m0, tab[ta&0xf][0]);\
+ m1=_mm_xor_si128(m1, tab[ta&0xf][1]);\
+ m0=_mm_xor_si128(m0,tab1[tb&0xf][0]);\
+ m1=_mm_xor_si128(m1,tab1[tb&0xf][1]);\
+ m2=_mm_xor_si128(m2,tab1[tb&0xf][2]);\
+
+ /* Main computation: consume a one byte at a time from the most significant byte, low nibble through tab and high nibble through tab1. */
+ m0 = m1 = m2 = m3 = _mm_setzero_si128();
+ /* Odd digits first: a[1] accumulates at word 0 and a[3] one word (128 bits) higher. */
+ for (j = 56; j >= 0; j -= 8) {
+ k = j + 4;
+ ta = (a[1] >> j);
+ tb = (a[1] >> k);
+ M(m2, m1, m0, ta, tb);
+ ta = (a[3] >> j);
+ tb = (a[3] >> k);
+ M(m3, m2, m1, ta, tb);
+ LSHIFT8(m3, m2, m1, m0); /* make room for the next lower byte */
+ }
+ for (j = 56; j >= 8; j -= 8) { /* even digits a[0] and a[2], all bytes except the lowest */
+ k = j + 4;
+ ta = (a[0] >> j);
+ tb = (a[0] >> k);
+ M(m2, m1, m0, ta, tb);
+ ta = (a[2] >> j);
+ tb = (a[2] >> k);
+ M(m3, m2, m1, ta, tb);
+ LSHIFT8(m3, m2, m1, m0);
+ }
+ ta = a[0]; /* lowest byte of a[0] and a[2]: no shift follows */
+ tb = (a[0] >> 4);
+ M(m2, m1, m0, ta, tb);
+ ta = a[2];
+ tb = (a[2] >> 4);
+ M(m3, m2, m1, ta, tb);
+ /* Spill the 512-bit product to t so that the reduction can work on 64-bit digits. */
+ _mm_store_si128((__m128i *) t + 0, m0);
+ _mm_store_si128((__m128i *) t + 1, m1);
+ _mm_store_si128((__m128i *) t + 2, m2);
+ _mm_store_si128((__m128i *) t + 3, m3);
+ /* Shift amounts of the reduction: each left shift is 64 minus its right shift, and 52, 55, 57 are 59 - 7, 59 - 4, 59 - 2. */
+ const int ra = 52;
+ const int rb = 55;
+ const int rc = 57;
+ const int rh = 59;
+ const int lh = 5;
+ const int la = 12;
+ const int lb = 9;
+ const int lc = 7;
+
+ dig_t d = t[7], a0 = t[0], a1 = t[1], a2 = t[2], a3 = t[3], a4 = t[4];
+ /* Digit 7 is folded onto digits 4 (right shifts) and 3 (left shifts). */
+ a4 ^= (d >> rh);
+ a4 ^= (d >> ra);
+ a4 ^= (d >> rb);
+ a4 ^= (d >> rc);
+
+ a3 ^= (d << lh);
+ a3 ^= (d << la);
+ a3 ^= (d << lb);
+ a3 ^= (d << lc);
+ /* Digit 6 is folded onto digits 3 and 2. */
+ d = t[6];
+ a3 ^= (d >> rh);
+ a3 ^= (d >> ra);
+ a3 ^= (d >> rb);
+ a3 ^= (d >> rc);
+
+ a2 ^= (d << lh);
+ a2 ^= (d << la);
+ a2 ^= (d << lb);
+ a2 ^= (d << lc);
+ /* Digit 5 is folded onto digits 2 and 1. */
+ d = t[5];
+ a2 ^= (d >> rh);
+ a2 ^= (d >> ra);
+ a2 ^= (d >> rb);
+ a2 ^= (d >> rc);
+
+ a1 ^= (d << lh);
+ a1 ^= (d << la);
+ a1 ^= (d << lb);
+ a1 ^= (d << lc);
+ /* Digit 4, including what digit 7 added to it, is folded onto digits 1 and 0. */
+ d = a4;
+ a1 ^= (d >> rh);
+ a1 ^= (d >> ra);
+ a1 ^= (d >> rb);
+ a1 ^= (d >> rc);
+
+ a0 ^= (d << lh);
+ a0 ^= (d << la);
+ a0 ^= (d << lb);
+ a0 ^= (d << lc);
+ /* Finally fold the bits of digit 3 at position 59 and above into digit 0 and clear them from digit 3. */
+ d = a3 >> rh;
+ a0 ^= d;
+ d <<= rh;
+
+ a0 ^= (d >> ra);
+ a0 ^= (d >> rb);
+ a0 ^= (d >> rc);
+ a3 ^= d;
+
+ c[3] = a3;
+ c[2] = a2;
+ c[1] = a1;
+ c[0] = a0;
+
+ return;
+}
+
+#else
+
+void fb_mulm_low(dig_t *c, dig_t *a, dig_t *b) { /* c = a * b reduced to four digits: table-based multiplication followed by the REDUCE() macro */
+ align __m128i tab[16][8], tab1[16][8]; /* rows are indexed by a 4-bit value; only columns 0..1 of tab and 0..2 of tab1 are used below */
+ __m128i s0, m0, m1, m2, m3, m4, m8, m9, t0, t1, t2, *t; /* t is never referenced; t0..t2 are not referenced directly (presumably REDUCE scratch -- confirm in macros.h) */
+ align dig_t x[2]; /* aligned bounce buffer for the upper half of the result */
+ char ta, tb; /* receive (truncated) shifted digits of a; masked to 4 bits inside M */
+ int i, j, k;
+ /* LOOKUP(i, T) fills column i of T with the XOR of the mX selected by the bits of the row index (bit 0 -> m0, bit 1 -> m1, bit 2 -> m2, bit 3 -> m3); it clobbers m2 and m9. */
+#define LOOKUP(i, T)\
+ T[0][i] = _mm_setzero_si128();\
+ T[1][i] = m0;\
+ T[2][i] = m1;\
+ T[3][i] = m9=_mm_xor_si128(m0,m1);\
+ T[4][i] = m2;\
+ T[5][i] = _mm_xor_si128(m2,m0);\
+ T[6][i] = _mm_xor_si128(m2,m1);\
+ T[7][i] = _mm_xor_si128(m2,m9);\
+ T[8][i] = m3;\
+ T[9][i] = _mm_xor_si128(m3,m0);\
+ T[10][i] = _mm_xor_si128(m3,m1);\
+ T[11][i] = _mm_xor_si128(m3,m9);\
+ T[12][i] = m2=_mm_xor_si128(m3,m2);\
+ T[13][i] = _mm_xor_si128(m2,m0);\
+ T[14][i] = _mm_xor_si128(m2,m1);\
+ T[15][i] = _mm_xor_si128(m2,m9);
+ /* Precomputation: for each 128-bit word of b, build b shifted left by 0..3 bits (tab) and by 4..7 bits (tab1). */
+ s0 = _mm_setzero_si128(); /* bits carried in from the previous word of b (none at the start) */
+ for (i = 0; i < 2; i++) {
+ m0 = _mm_load_si128((__m128i *) (b + 2 * i)); /* word i of b; aligned load, b must be 16-byte aligned */
+ m9 = _mm_srli_epi64(m0, 57); /* top 7 bits of each 64-bit lane */
+ m8 = _mm_slli_si128(m9, 8); /* the low lane's top bits feed the high lane */
+ m8 = _mm_xor_si128(m8, s0); /* the low lane is fed by the previous word's carry */
+ s0 = _mm_srli_si128(m9, 8); /* the high lane's top bits become the carry into the next word */
+ m1 = _mm_slli_epi64(m0, 1);
+ m2 = _mm_slli_epi64(m0, 2);
+ m3 = _mm_slli_epi64(m0, 3);
+ m3 = _mm_xor_si128(m3, _mm_srli_epi64(m8, 4)); /* m3 = word i of b << 3 */
+ m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 5)); /* m2 = word i of b << 2 */
+ m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 6)); /* m1 = word i of b << 1 */
+ LOOKUP(i, tab); /* tab[u][i] = word i of u(z) * b(z) for the 4-bit polynomial u */
+ m4 = m0; /* LOOKUP does not modify m0, so m4 is word i of b */
+ m0 = _mm_slli_epi64(m4, 4);
+ m1 = _mm_slli_epi64(m4, 5);
+ m2 = _mm_slli_epi64(m4, 6);
+ m3 = _mm_slli_epi64(m4, 7);
+ m3 = _mm_xor_si128(m3, m8); /* m3 = word i of b << 7 */
+ m2 = _mm_xor_si128(m2, _mm_srli_epi64(m8, 1)); /* m2 = word i of b << 6 */
+ m1 = _mm_xor_si128(m1, _mm_srli_epi64(m8, 2)); /* m1 = word i of b << 5 */
+ m0 = _mm_xor_si128(m0, _mm_srli_epi64(m8, 3)); /* m0 = word i of b << 4 */
+ LOOKUP(i, tab1); /* tab1[u][i] = word i of u(z) * z^4 * b(z) */
+ }
+ m3 = s0; /* i == 2 here: the bits of b << 7 that spill past 256 bits */
+ m2 = _mm_srli_epi64(s0, 1); /* spill of b << 6 */
+ m1 = _mm_setzero_si128(); /* spill of b << 5 taken as zero; m1 is re-zeroed before it is next read */
+ m0 = _mm_setzero_si128(); /* spill of b << 4 taken as zero; m0 is re-zeroed before it is next read */
+ tab1[0][i] = tab1[1][i] = tab1[2][i] = tab1[3][i] = m9 = _mm_setzero_si128(); /* rows 0..3 involve only b << 4 and b << 5 (zero spill holds when b[3] < 2^59 -- confirm callers pass reduced operands) */
+ tab1[4][i] = tab1[5][i] = tab1[6][i] = tab1[7][i] = m2;
+ tab1[8][i] = tab1[9][i] = tab1[10][i] = tab1[11][i] = m3;
+ m2 =_mm_xor_si128(m3,m2);
+ tab1[12][i] = tab1[13][i] = tab1[14][i] = tab1[15][i] = m2;
+#undef LOOKUP
+ /* LSHIFT8 shifts the 512-bit value m3:m2:m1:m0 left by one byte. It is not #undef'd in this function. */
+#define LSHIFT8(m3,m2,m1,m0)\
+ m3=_mm_alignr_epi8(m3,m2,15);\
+ m2=_mm_alignr_epi8(m2,m1,15);\
+ m1=_mm_alignr_epi8(m1,m0,15);\
+ m0=_mm_slli_si128(m0,1);
+ /* M masks ta and tb to 4 bits in place, then adds tab[ta] (two words) and tab1[tb] (three words) into the accumulator words m0, m1, m2. */
+#define M(m2,m1,m0,ta,tb)\
+ ta &= 0x0f; tb &= 0x0f;\
+ m0=_mm_xor_si128(m0, tab[ta][0]);\
+ m1=_mm_xor_si128(m1, tab[ta][1]);\
+ m0=_mm_xor_si128(m0,tab1[tb][0]);\
+ m1=_mm_xor_si128(m1,tab1[tb][1]);\
+ m2=_mm_xor_si128(m2,tab1[tb][2]);\
+
+ /* Main computation: consume a one byte at a time from the most significant byte, low nibble through tab and high nibble through tab1. */
+ m0 = m1 = m2 = m3 = _mm_setzero_si128();
+ /* Odd digits first: a[1] accumulates at word 0 and a[3] one word (128 bits) higher. */
+ for (j = 56; j >= 0; j -= 8) {
+ k = j + 4;
+ ta = (a[1] >> j);
+ tb = (a[1] >> k);
+ M(m2, m1, m0, ta, tb);
+ ta = (a[3] >> j);
+ tb = (a[3] >> k);
+ M(m3, m2, m1, ta, tb);
+ LSHIFT8(m3, m2, m1, m0); /* make room for the next lower byte */
+ }
+ for (j = 56; j >= 8; j -= 8) { /* even digits a[0] and a[2], all bytes except the lowest */
+ k = j + 4;
+ ta = (a[0] >> j);
+ tb = (a[0] >> k);
+ M(m2, m1, m0, ta, tb);
+ ta = (a[2] >> j);
+ tb = (a[2] >> k);
+ M(m3, m2, m1, ta, tb);
+ LSHIFT8(m3, m2, m1, m0);
+ }
+ ta = a[0]; /* lowest byte of a[0] and a[2]: no shift follows */
+ tb = (a[0] >> 4);
+ M(m2, m1, m0, ta, tb);
+ ta = a[2];
+ tb = (a[2] >> 4);
+ M(m3, m2, m1, ta, tb);
+
+#undef M
+ /* The unreduced product is now in m3:m2:m1:m0. */
+ REDUCE(); /* defined in macros.h (not shown here); presumably reduces m3:m2:m1:m0 into m1:m0 -- confirm */
+ _mm_store_si128((__m128i *) c + 0, m0); /* c[0], c[1]; c must be 16-byte aligned */
+ _mm_store_si128((__m128i *) x, m1);
+ c[2] = x[0];
+ c[3] = x[1] & 0x07FFFFFFFFFFFFFF; /* keep only the low 59 bits of the top digit */
+#undef M /* redundant: M was already undefined above */
+}
+
+#endif
diff --git a/src/low/curve2251-sse/relic_fb_mul_low_sf.c b/src/low/curve2251-sse/relic_fb_mul_low_sf.c
new file mode 100755
index 000000000..f157084bb
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_mul_low_sf.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see .
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field bit multiplication functions.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_bn_low.h"
+#include "relic_util.h"
+
+#include
+#include
+#include "macros.h"
+
+#define INV(A,B,C,D) D,C,B,A /* reverses its four arguments, so each row below is written highest word first but stored lowest word first */
+ /* tm: 16 rows of 16 bytes. With little-endian storage, byte j of row i is the carry-less (GF(2)[z]) product of the 4-bit polynomials i and j. */
+const align uint32_t tm[] = {
+ INV(0x00000000, 0x00000000, 0x00000000, 0x00000000), /* row 0: 0 * j */
+ INV(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100), /* row 1: 1 * j = j */
+ INV(0x1E1C1A18, 0x16141210, 0x0E0C0A08, 0x06040200), /* row 2: z * j */
+ INV(0x11121714, 0x1D1E1B18, 0x090A0F0C, 0x05060300), /* row 3: (z + 1) * j */
+ INV(0x3C383430, 0x2C282420, 0x1C181410, 0x0C080400), /* row 4: z^2 * j */
+ INV(0x3336393C, 0x27222D28, 0x1B1E1114, 0x0F0A0500), /* row 5 */
+ INV(0x22242E28, 0x3A3C3630, 0x12141E18, 0x0A0C0600), /* row 6 */
+ INV(0x2D2A2324, 0x31363F38, 0x15121B1C, 0x090E0700), /* row 7 */
+ INV(0x78706860, 0x58504840, 0x38302820, 0x18100800), /* row 8: z^3 * j */
+ INV(0x777E656C, 0x535A4148, 0x3F362D24, 0x1B120900), /* row 9 */
+ INV(0x666C7278, 0x4E445A50, 0x363C2228, 0x1E140A00), /* row 10 */
+ INV(0x69627F74, 0x454E5358, 0x313A272C, 0x1D160B00), /* row 11 */
+ INV(0x44485C50, 0x74786C60, 0x24283C30, 0x14180C00), /* row 12 */
+ INV(0x4B46515C, 0x7F726568, 0x232E3934, 0x171A0D00), /* row 13 */
+ INV(0x5A544648, 0x626C7E70, 0x2A243638, 0x121C0E00), /* row 14 */
+ INV(0x555A4B44, 0x69667778, 0x2D22333C, 0x111E0F00), /* row 15 */
+};
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void fb_mul1_low(dig_t *c, dig_t *a, dig_t digit) {
+ /* Unreduced product c = a * digit, computed by shift-and-add over the bits of digit; every path defines FB_DIGS + 1 digits of c. */
+ dig_t b1, b2;
+ if (digit == 0) {
+ dv_zero(c, FB_DIGS + 1);
+ return;
+ }
+ if (digit == 1) {
+ fb_copy(c, a);
+ c[FB_DIGS] = 0; /* fb_copy is assumed to write only FB_DIGS digits; the extra digit must be defined like on the other paths */
+ return;
+ }
+ c[FB_DIGS] = fb_lshb_low(c, a, util_bits_dig(digit) - 1); /* start from a shifted by the position of digit's top set bit; the value returned by the shift is stored as the extra digit */
+ for (int i = util_bits_dig(digit) - 2; i > 0; i--) { /* remaining set bits above bit 0 */
+ if (digit & ((dig_t)1 << i)) {
+ int j = FB_DIGIT - i; /* complementary right shift for the bits crossing into the next digit */
+ b1 = a[0];
+ c[0] ^= (b1 << i);
+ for (int k = 1; k < FB_DIGS; k++) {
+ b2 = a[k];
+ c[k] ^= ((b2 << i) | (b1 >> j));
+ b1 = b2;
+ }
+ c[FB_DIGS] ^= (b1 >> j); /* bits of the top digit of a that spill into the extra digit */
+ }
+ }
+ if (digit & (dig_t)1) { /* bit 0 contributes a itself */
+ fb_add(c, c, a);
+ }
+}
+
+void fb_mulm_low(dig_t * c, dig_t * a, dig_t * b) { /* c = a * b reduced to four digits: nibble-wise Karatsuba (AL*BL, AH*BH, AM*BM) using byte shuffles on the tm table, then REDUCE() */
+ __m128i rl[FB_DIGS], rh[FB_DIGS], l0, l1, h0, h1; /* rl = AL*BL, rh = AH*BH; l0/l1 and h0/h1 hold the low and high nibbles of b's two words */
+ __m128i t0, t1, mask, m[FB_DIGS], m0, m1, m2, m3, m8, m9; /* m8 and m9 are not referenced directly here (presumably scratch for REDUCE -- confirm in macros.h) */
+ dig_t r0, r1, r2, r3; /* the nibble stream of a consumed by MultK */
+ int i, j, k, ta;
+ align dig_t x[2]; /* target of an aligned 128-bit store below, so it must be 16-byte aligned */
+
+ mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); /* selects the low nibble of every byte */
+
+ /* computes BL, BH and BM */
+ h0 = _mm_load_si128((__m128i *) & b[0]); /* aligned loads: b must be 16-byte aligned */
+ h1 = _mm_load_si128((__m128i *) & b[2]);
+ /* LSHIFT4 shifts the four-word vector m left by 4 bits as one 512-bit value; it clobbers m1, t0, t1 and j. */
+#define LSHIFT4(m)\
+ m1=m[3];\
+ t1=_mm_srli_epi64(m1,60);\
+ for(j=3;j>0;j--){\
+ m1=_mm_slli_epi64(m1,4);\
+ t0=_mm_xor_si128(m1,_mm_slli_si128(t1,8));\
+ m1=m[j-1];\
+ t1=_mm_srli_epi64(m1,60);\
+ m[j]=_mm_xor_si128(t0,_mm_srli_si128(t1,8));\
+ }\
+ m[0]=_mm_slli_epi64(m1,4);\
+ m[0]=_mm_xor_si128(m[0],_mm_slli_si128(t1,8));
+ /* LSHIFT8 shifts m3:m2:m1:m0 left by one byte; its first parameter (m4) is never expanded. */
+#define LSHIFT8(m4,m3,m2,m1,m0)\
+ m3=_mm_alignr_epi8(m3,m2,15);\
+ m2=_mm_alignr_epi8(m2,m1,15);\
+ m1=_mm_alignr_epi8(m1,m0,15);\
+ m0=_mm_slli_si128(m0,1);
+ /* LSHIFT8V is the same one-byte shift applied to a four-element array. */
+#define LSHIFT8V(m)\
+ m[3]=_mm_alignr_epi8(m[3],m[2],15);\
+ m[2]=_mm_alignr_epi8(m[2],m[1],15);\
+ m[1]=_mm_alignr_epi8(m[1],m[0],15);\
+ m[0]=_mm_slli_si128(m[0],1);
+ /* M loads row ta of tm and uses the nibbles in b0/b1 as byte indices into it, XORing the looked-up bytes into m0/m1. */
+#define M(m1,m0,ta,b0,b1)\
+ t0 = _mm_load_si128((__m128i *)tm + ta);\
+ m0 = _mm_xor_si128(m0, _mm_shuffle_epi8(t0, b0));\
+ m1 = _mm_xor_si128(m1, _mm_shuffle_epi8(t0, b1));\
+
+#define MultK(al,b0,b1)\
+ m0=m1=m2=m3=_mm_setzero_si128();\
+ for(k=56;k>=0; k-=8){\
+ ta=(r1>>k)&0x0f;\
+ M(m1,m0,ta,b0,b1);\
+ ta=(r3>>k)&0x0f;\
+ M(m2,m1,ta,b0,b1);\
+ LSHIFT8(m4,m3,m2,m1,m0);\
+ }\
+ for(k=56;k>=8; k-=8){\
+ ta=(r0>>k)&0x0f;\
+ M(m1,m0,ta,b0,b1);\
+ ta=(r2>>k)&0x0f;\
+ M(m2,m1,ta,b0,b1);\
+ LSHIFT8(m4,m3,m2,m1,m0);\
+ }\
+ ta=(r0)&0xf; M(m1,m0,ta,b0,b1);\
+ ta=(r2)&0xf; M(m2,m1,ta,b0,b1);\
+
+ l0 = _mm_and_si128(h0, mask); /* BL: low nibble of every byte of b */
+ l1 = _mm_and_si128(h1, mask);
+ /* MultK reads the low nibble of every byte of r0..r3; here that is AL. Its first argument is only a label and is never expanded. */
+ r0 = a[0];
+ r1 = a[1];
+ r2 = a[2];
+ r3 = a[3];
+
+ /* AL * BL */
+ MultK(a, l0, l1);
+
+ rl[0] = m0;
+ rl[1] = m1;
+ rl[2] = m2;
+ rl[3] = m3;
+ /* BH: high nibble of every byte of b, moved down to the low nibble position. */
+ h0 = _mm_and_si128(_mm_srli_epi64(h0, 4), mask);
+ h1 = _mm_and_si128(_mm_srli_epi64(h1, 4), mask);
+ /* After the shift the low nibble of every byte of r0..r3 is AH. */
+ r0 >>= 4;
+ r1 >>= 4;
+ r2 >>= 4;
+ r3 >>= 4;
+
+ /* AH * BH */
+ MultK(ah, h0, h1);
+
+ rh[0] = m0;
+ rh[1] = m1;
+ rh[2] = m2;
+ rh[3] = m3;
+ /* BM = BH + BL */
+ h0 = _mm_xor_si128(h0, l0);
+ h1 = _mm_xor_si128(h1, l1);
+ /* AM = AH + AL in the low nibble of every byte (the high nibbles are ignored by MultK). */
+ r0 ^= a[0];
+ r1 ^= a[1];
+ r2 ^= a[2];
+ r3 ^= a[3];
+
+ /* AM * BM */
+ MultK(am, h0, h1);
+
+ m[0] = m0;
+ m[1] = m1;
+ m[2] = m2;
+ m[3] = m3;
+
+ /* m = m + rh + rl */
+ for (i = 0; i < FB_DIGS; i++) {
+ m[i] = _mm_xor_si128(m[i], rh[i]);
+ m[i] = _mm_xor_si128(m[i], rl[i]);
+ }
+
+ /* m= m + x^8 rh + x^4 m + rl */
+
+ LSHIFT4(m);
+ LSHIFT8V(rh);
+
+ for (i = 0; i < FB_DIGS; i++) {
+ m[i] = _mm_xor_si128(m[i], rh[i]);
+ m[i] = _mm_xor_si128(m[i], rl[i]);
+ }
+ /* REDUCE works on the named variables m0..m3, so copy the product out of the array. */
+ m0 = m[0];
+ m1 = m[1];
+ m2 = m[2];
+ m3 = m[3];
+
+ REDUCE(); /* defined in macros.h (not shown here); presumably reduces m3:m2:m1:m0 into m1:m0 -- confirm */
+ _mm_store_si128((__m128i *) c + 0, m0); /* c[0], c[1]; c must be 16-byte aligned */
+ _mm_store_si128((__m128i *) x, m1);
+ c[2] = x[0];
+ c[3] = x[1] & 0x07FFFFFFFFFFFFFF; /* keep only the low 59 bits of the top digit */
+}
+
+void fb_muln_low(dig_t * c, dig_t * a, dig_t * b) { /* c = a * b without reduction: nibble-wise Karatsuba (AL*BL, AH*BH, AM*BM) using byte shuffles on the tm table; four 128-bit words are stored to c */
+ __m128i rl[FB_DIGS], rh[FB_DIGS], l0, l1, h0, h1; /* rl = AL*BL, rh = AH*BH; l0/l1 and h0/h1 hold the low and high nibbles of b's two words */
+ __m128i t0, t1, mask, m[FB_DIGS], m0, m1, m2, m3;
+ dig_t r0, r1, r2, r3; /* the nibble stream of a consumed by MultK */
+ int i, j, k, ta;
+
+ mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); /* selects the low nibble of every byte */
+
+ /* computes BL, BH and BM */
+ h0 = _mm_load_si128((__m128i *) & b[0]); /* aligned loads: b must be 16-byte aligned */
+ h1 = _mm_load_si128((__m128i *) & b[2]);
+ /* LSHIFT4 shifts the four-word vector m left by 4 bits as one 512-bit value; it clobbers m1, t0, t1 and j. */
+#define LSHIFT4(m)\
+ m1=m[3];\
+ t1=_mm_srli_epi64(m1,60);\
+ for(j=3;j>0;j--){\
+ m1=_mm_slli_epi64(m1,4);\
+ t0=_mm_xor_si128(m1,_mm_slli_si128(t1,8));\
+ m1=m[j-1];\
+ t1=_mm_srli_epi64(m1,60);\
+ m[j]=_mm_xor_si128(t0,_mm_srli_si128(t1,8));\
+ }\
+ m[0]=_mm_slli_epi64(m1,4);\
+ m[0]=_mm_xor_si128(m[0],_mm_slli_si128(t1,8));
+ /* LSHIFT8 shifts m3:m2:m1:m0 left by one byte; its first parameter (m4) is never expanded. */
+#define LSHIFT8(m4,m3,m2,m1,m0)\
+ m3=_mm_alignr_epi8(m3,m2,15);\
+ m2=_mm_alignr_epi8(m2,m1,15);\
+ m1=_mm_alignr_epi8(m1,m0,15);\
+ m0=_mm_slli_si128(m0,1);
+ /* LSHIFT8V is the same one-byte shift applied to a four-element array. */
+#define LSHIFT8V(m)\
+ m[3]=_mm_alignr_epi8(m[3],m[2],15);\
+ m[2]=_mm_alignr_epi8(m[2],m[1],15);\
+ m[1]=_mm_alignr_epi8(m[1],m[0],15);\
+ m[0]=_mm_slli_si128(m[0],1);
+ /* M loads row ta of tm and uses the nibbles in b0/b1 as byte indices into it, XORing the looked-up bytes into m0/m1. */
+#define M(m1,m0,ta,b0,b1)\
+ t0 = _mm_load_si128((__m128i *)tm + ta);\
+ m0 = _mm_xor_si128(m0, _mm_shuffle_epi8(t0, b0));\
+ m1 = _mm_xor_si128(m1, _mm_shuffle_epi8(t0, b1));\
+
+#define MultK(al,b0,b1)\
+ m0=m1=m2=m3=_mm_setzero_si128();\
+ for(k=56;k>=0; k-=8){\
+ ta=(r1>>k)&0x0f;\
+ M(m1,m0,ta,b0,b1);\
+ ta=(r3>>k)&0x0f;\
+ M(m2,m1,ta,b0,b1);\
+ LSHIFT8(m4,m3,m2,m1,m0);\
+ }\
+ for(k=56;k>=8; k-=8){\
+ ta=(r0>>k)&0x0f;\
+ M(m1,m0,ta,b0,b1);\
+ ta=(r2>>k)&0x0f;\
+ M(m2,m1,ta,b0,b1);\
+ LSHIFT8(m4,m3,m2,m1,m0);\
+ }\
+ ta=(r0)&0xf; M(m1,m0,ta,b0,b1);\
+ ta=(r2)&0xf; M(m2,m1,ta,b0,b1);\
+
+ l0 = _mm_and_si128(h0, mask); /* BL: low nibble of every byte of b */
+ l1 = _mm_and_si128(h1, mask);
+ /* MultK reads the low nibble of every byte of r0..r3; here that is AL. Its first argument is only a label and is never expanded. */
+ r0 = a[0];
+ r1 = a[1];
+ r2 = a[2];
+ r3 = a[3];
+
+ /* AL * BL */
+ MultK(a, l0, l1);
+
+ rl[0] = m0;
+ rl[1] = m1;
+ rl[2] = m2;
+ rl[3] = m3;
+ /* BH: high nibble of every byte of b, moved down to the low nibble position. */
+ h0 = _mm_and_si128(_mm_srli_epi64(h0, 4), mask);
+ h1 = _mm_and_si128(_mm_srli_epi64(h1, 4), mask);
+ /* After the shift the low nibble of every byte of r0..r3 is AH. */
+ r0 >>= 4;
+ r1 >>= 4;
+ r2 >>= 4;
+ r3 >>= 4;
+
+ /* AH * BH */
+ MultK(ah, h0, h1);
+
+ rh[0] = m0;
+ rh[1] = m1;
+ rh[2] = m2;
+ rh[3] = m3;
+ /* BM = BH + BL */
+ h0 = _mm_xor_si128(h0, l0);
+ h1 = _mm_xor_si128(h1, l1);
+ /* AM = AH + AL in the low nibble of every byte (the high nibbles are ignored by MultK). */
+ r0 ^= a[0];
+ r1 ^= a[1];
+ r2 ^= a[2];
+ r3 ^= a[3];
+
+ /* AM * BM */
+ MultK(am, h0, h1);
+
+ m[0] = m0;
+ m[1] = m1;
+ m[2] = m2;
+ m[3] = m3;
+
+ /* m = m + rh + rl */
+ for (i = 0; i < FB_DIGS; i++) {
+ m[i] = _mm_xor_si128(m[i], rh[i]);
+ m[i] = _mm_xor_si128(m[i], rl[i]);
+ }
+
+ /* m= m + x^8 rh + x^4 m + rl */
+
+ LSHIFT4(m);
+ LSHIFT8V(rh);
+
+ for (i = 0; i < FB_DIGS; i++) {
+ m[i] = _mm_xor_si128(m[i], rh[i]);
+ m[i] = _mm_xor_si128(m[i], rl[i]);
+ }
+ /* Aligned stores: c must be 16-byte aligned and hold eight digits. */
+ _mm_store_si128((__m128i *) c + 0, m[0]);
+ _mm_store_si128((__m128i *) c + 1, m[1]);
+ _mm_store_si128((__m128i *) c + 2, m[2]);
+ _mm_store_si128((__m128i *) c + 3, m[3]);
+}
diff --git a/src/low/curve2251-sse/relic_fb_rdc_low.c b/src/low/curve2251-sse/relic_fb_rdc_low.c
new file mode 100755
index 000000000..48879d53b
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_rdc_low.c
@@ -0,0 +1,158 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see .
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level modular reduction functions.
+ *
+ * @version $Id: relic_fb_rdc_low.c 194 2009-11-28 01:54:32Z dfaranha $
+ * @ingroup fb
+ */
+
+#include
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_util.h"
+#include "macros.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+void fb_rdcn_low(dig_t *c, dig_t *a) {
+ /*
+ * Folds the eight digits a[0..7] down to the four digits c[0..3].
+ *
+ * A digit taken from position i + 4 adds its four right-shifted copies to
+ * digit i + 1 and its four left-shifted copies to digit i. Every left
+ * shift is 64 minus the matching right shift, and 52, 55 and 57 are
+ * 59 - 7, 59 - 4 and 59 - 2. All reads of a happen before c is written.
+ */
+ enum {
+ RSH_H = 59,
+ RSH_A = 52,
+ RSH_B = 55,
+ RSH_C = 57,
+ LSH_H = 5,
+ LSH_A = 12,
+ LSH_B = 9,
+ LSH_C = 7
+ };
+
+ /* Working copies of the five lowest digits; digits 5..7 are read as they are folded. */
+ dig_t lo0 = a[0];
+ dig_t lo1 = a[1];
+ dig_t lo2 = a[2];
+ dig_t lo3 = a[3];
+ dig_t mid = a[4];
+ dig_t fold;
+
+ /* Digit 7 lands on digits 4 and 3. */
+ fold = a[7];
+ mid ^= (fold >> RSH_H) ^ (fold >> RSH_A) ^ (fold >> RSH_B) ^ (fold >> RSH_C);
+ lo3 ^= (fold << LSH_H) ^ (fold << LSH_A) ^ (fold << LSH_B) ^ (fold << LSH_C);
+
+ /* Digit 6 lands on digits 3 and 2. */
+ fold = a[6];
+ lo3 ^= (fold >> RSH_H) ^ (fold >> RSH_A) ^ (fold >> RSH_B) ^ (fold >> RSH_C);
+ lo2 ^= (fold << LSH_H) ^ (fold << LSH_A) ^ (fold << LSH_B) ^ (fold << LSH_C);
+
+ /* Digit 5 lands on digits 2 and 1. */
+ fold = a[5];
+ lo2 ^= (fold >> RSH_H) ^ (fold >> RSH_A) ^ (fold >> RSH_B) ^ (fold >> RSH_C);
+ lo1 ^= (fold << LSH_H) ^ (fold << LSH_A) ^ (fold << LSH_B) ^ (fold << LSH_C);
+
+ /* Digit 4, including what digit 7 just added to it, lands on digits 1 and 0. */
+ fold = mid;
+ lo1 ^= (fold >> RSH_H) ^ (fold >> RSH_A) ^ (fold >> RSH_B) ^ (fold >> RSH_C);
+ lo0 ^= (fold << LSH_H) ^ (fold << LSH_A) ^ (fold << LSH_B) ^ (fold << LSH_C);
+
+ /*
+ * Bits 59 and above of digit 3 are folded one last time: moved down to
+ * bit 0 they go straight into digit 0; put back at bit 59 they are folded
+ * with the three remaining right shifts and then cleared from digit 3.
+ * No left-shifted copies are needed here, they would all be zero.
+ */
+ fold = lo3 >> RSH_H;
+ lo0 ^= fold;
+ fold <<= RSH_H;
+
+ lo0 ^= (fold >> RSH_A);
+ lo0 ^= (fold >> RSH_B);
+ lo0 ^= (fold >> RSH_C);
+ lo3 ^= fold;
+
+ /* Write the four result digits. */
+ c[3] = lo3;
+ c[2] = lo2;
+ c[1] = lo1;
+ c[0] = lo0;
+}
+
+void fb_rdc1_low(dig_t *c, dig_t *a) { /* reduces the FB_DIGS + 1 digit value in a to four digits in c; a itself is modified in place */
+ dig_t d;
+ const int fa = 7; /* fa, fb, fc: the three offsets subtracted from FB_BITS to derive the shift amounts below */
+ const int fb = 4;
+ const int fc = 2;
+ /* For each offset: r = bit position inside a digit, s = digit count, l = complementary left shift. */
+ const int rh = FB_BITS % FB_DIGIT;
+ const int sh = FB_BITS / FB_DIGIT + 1;
+ const int lh = FB_DIGIT - rh;
+ const int ra = (FB_BITS - fa) % FB_DIGIT;
+ const int sa = (FB_BITS - fa) / FB_DIGIT + 1;
+ const int la = FB_DIGIT - ra;
+ const int rb = (FB_BITS - fb) % FB_DIGIT;
+ const int sb = (FB_BITS - fb) / FB_DIGIT + 1;
+ const int lb = FB_DIGIT - rb;
+ const int rc = (FB_BITS - fc) % FB_DIGIT;
+ const int sc = (FB_BITS - fc) / FB_DIGIT + 1;
+ const int lc = FB_DIGIT - rc;
+ /* Fold the extra digit a[FB_DIGS] back into the lower digits. */
+ d = a[FB_DIGS];
+ /* NOTE(review): no (d >> rh) term is applied for the h shift, unlike for a, b and c -- this presumes d has no bits at or above position rh; confirm against callers. */
+ a[FB_DIGS - sh] ^= (d << lh);
+ a[FB_DIGS - sa + 1] ^= (d >> ra);
+ a[FB_DIGS - sa] ^= (d << la);
+
+ a[FB_DIGS - sb + 1] ^= (d >> rb);
+ a[FB_DIGS - sb] ^= (d << lb);
+ a[FB_DIGS - sc + 1] ^= (d >> rc);
+ a[FB_DIGS - sc] ^= (d << lc);
+ /* Fold the bits of the top digit that sit at position rh and above. */
+ d = a[sh - 1] >> rh;
+
+ dig_t a0 = a[0];
+ a0 ^= d;
+ d <<= rh;
+
+ a0 ^= (d >> ra);
+ a0 ^= (d >> rb);
+ a0 ^= (d >> rc);
+ a[sh - 1] ^= d; /* clear the folded bits from the same digit they were read from */
+
+ c[0] = a0;
+ c[1] = a[1];
+ c[2] = a[2];
+ c[3] = a[3];
+}
diff --git a/src/low/curve2251-sse/relic_fb_slv_low.c b/src/low/curve2251-sse/relic_fb_slv_low.c
new file mode 100755
index 000000000..4825f527f
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_slv_low.c
@@ -0,0 +1,193 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see .
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field half-trace.
+ *
+ * @version $Id: relic_fb_slv_low.c 652 2011-02-20 23:50:00Z dfaranha $
+ * @ingroup fb
+ */
+
+#include
+#include
+#ifdef __PCLMUL__
+#include
+#endif
+
+#include
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_util.h"
+
+/*============================================================================*/
+/* Private definitions */
+/*============================================================================*/
+
+void fb_slvn_low(dig_t *c, dig_t *a) { /* half-trace kernel (per this file's header): a chain of shuffle/lookup steps on a, followed by a nibble-indexed table accumulation into c */
+ int i;
+ dig_t *p, u0, u1, u2, u3;
+ void *tab = fb_poly_get_slv(); /* precomputed table; indexed below in units of sizeof(fb_st) */
+ __m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm;
+ /* Constants: byte permutation, lane/nibble masks and two 16-entry byte lookup tables used through _mm_shuffle_epi8. */
+ perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200); /* gathers even-indexed bytes into the low half and odd-indexed bytes into the high half */
+ mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000); /* selects the upper 64 bits */
+ mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0); /* high nibble of every byte */
+ mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); /* low nibble of every byte */
+ sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100); /* lookup applied to low nibbles */
+ sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400); /* lookup applied to high nibbles */
+ /* Aligned loads: a must be 16-byte aligned. t0 is modified by the steps below, t1 is only read. */
+ t0 = _mm_load_si128((__m128i *)a);
+ t1 = _mm_load_si128((__m128i *)(a + 2));
+ r0 = r1 = _mm_setzero_si128(); /* accumulators for the result */
+ /* Step 1: transform the upper word t1 and fold the result into t0 and r0. */
+ m0 = _mm_shuffle_epi8(t1, perm);
+ m1 = _mm_and_si128(m0, mask0);
+ m2 = _mm_and_si128(m0, mask1);
+ m2 = _mm_srli_epi64(m2, 4);
+ m2 = _mm_shuffle_epi8(sqrt1, m2); /* per-byte table lookup: sqrt1[high nibble] */
+ m1 = _mm_shuffle_epi8(sqrt0, m1); /* per-byte table lookup: sqrt0[low nibble] */
+ m1 = _mm_xor_si128(m1, m2);
+
+ m2 = _mm_slli_si128(m1, 8);
+ m1 = _mm_and_si128(m1, mask2);
+ m1 = _mm_slli_epi64(m1, 4);
+ m1 = _mm_xor_si128(m1, m2);
+ t0 = _mm_xor_si128(t0, m1);
+ r0 = _mm_xor_si128(r0, m1);
+ /* Step 2: the same transform on the upper 64 bits of t0, folded into its lower 64 bits. */
+ m0 = _mm_and_si128(t0, mask2);
+ m0 = _mm_shuffle_epi8(m0, perm);
+ m1 = _mm_and_si128(m0, mask0);
+ m2 = _mm_and_si128(m0, mask1);
+ m2 = _mm_srli_epi64(m2, 4);
+ m2 = _mm_shuffle_epi8(sqrt1, m2);
+ m1 = _mm_shuffle_epi8(sqrt0, m1);
+ m1 = _mm_xor_si128(m1, m2);
+
+ m2 = _mm_srli_si128(m1, 8);
+ m1 = _mm_andnot_si128(mask2, m1);
+ m2 = _mm_slli_epi64(m2, 4);
+ m1 = _mm_xor_si128(m1, m2);
+ t0 = _mm_xor_si128(t0, m1);
+ r0 = _mm_xor_si128(r0, m1);
+ /* Step 3: the same transform on bits 32..63 of t0. */
+ m1 = _mm_srli_si128(t0, 4);
+ m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF));
+ m0 = _mm_shuffle_epi8(m1, perm);
+ m1 = _mm_and_si128(m0, mask0);
+ m2 = _mm_and_si128(m0, mask1);
+ m2 = _mm_srli_epi64(m2, 4);
+ m2 = _mm_shuffle_epi8(sqrt1, m2);
+ m1 = _mm_shuffle_epi8(sqrt0, m1);
+ m1 = _mm_xor_si128(m1, m2);
+ m2 = _mm_slli_si128(m1, 8);
+ m1 = _mm_slli_epi64(m1, 4);
+ m1 = _mm_xor_si128(m1, m2);
+ m1 = _mm_srli_si128(m1, 6);
+ t0 = _mm_xor_si128(t0, m1);
+ r0 = _mm_xor_si128(r0, m1);
+ /* Step 4: the same transform on bits 16..31 of t0. */
+ m1 = _mm_srli_si128(t0, 2);
+ m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF));
+ m0 = _mm_shuffle_epi8(m1, perm);
+ m1 = _mm_and_si128(m0, mask0);
+ m2 = _mm_and_si128(m0, mask1);
+ m2 = _mm_srli_epi64(m2, 4);
+ m2 = _mm_shuffle_epi8(sqrt1, m2);
+ m1 = _mm_shuffle_epi8(sqrt0, m1);
+ m1 = _mm_xor_si128(m1, m2);
+ m2 = _mm_slli_si128(m1, 8);
+ m1 = _mm_slli_epi64(m1, 4);
+ m1 = _mm_xor_si128(m1, m2);
+ m1 = _mm_srli_si128(m1, 7);
+ t0 = _mm_xor_si128(t0, m1);
+ r0 = _mm_xor_si128(r0, m1);
+ /* Step 5: bits 8..15 of t0 -- the even-position bits are compacted into 4 bits with shifts and masks, then placed at bits 4..7. */
+ m1 = _mm_srli_si128(t0, 1);
+ m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55));
+ m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
+ m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33));
+ m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2));
+ m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F));
+ m1 = _mm_slli_epi64(m1, 4);
+ t0 = _mm_xor_si128(t0, m1);
+ r0 = _mm_xor_si128(r0, m1);
+ /* Step 6: bits 4..7 of t0 -- the two even-position bits are compacted and placed at bits 2..3. */
+ m1 = _mm_srli_epi64(t0, 4);
+ m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5));
+ m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
+ m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3));
+ m1 = _mm_slli_epi64(m1, 2);
+ t0 = _mm_xor_si128(t0, m1);
+ r0 = _mm_xor_si128(r0, m1);
+ /* Step 7: bit 2 of t0 is placed at bit 1. */
+ m1 = _mm_srli_epi64(t0, 2);
+ m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1));
+ m1 = _mm_slli_epi64(m1, 1);
+ t0 = _mm_xor_si128(t0, m1);
+ r0 = _mm_xor_si128(r0, m1);
+ /* New lookup tables for the final nibble transform applied to t0 and t1. */
+ sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000);
+ sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000);
+ /* Map every byte of t0 and t1 to sqrt0[low nibble] | sqrt1[high nibble]; the results become the table indices u0..u3. */
+ m1 = _mm_and_si128(t0, mask0);
+ m2 = _mm_and_si128(t0, mask1);
+ m3 = _mm_and_si128(t1, mask0);
+ m4 = _mm_and_si128(t1, mask1);
+ m2 = _mm_srli_epi64(m2, 4);
+ m4 = _mm_srli_epi64(m4, 4);
+ m2 = _mm_shuffle_epi8(sqrt1, m2);
+ m1 = _mm_shuffle_epi8(sqrt0, m1);
+ m4 = _mm_shuffle_epi8(sqrt1, m4);
+ m3 = _mm_shuffle_epi8(sqrt0, m3);
+ m1 = _mm_or_si128(m1, m2);
+ u0 = _mm_extract_epi64(m1, 0);
+ u1 = _mm_extract_epi64(m1, 1);
+ m3 = _mm_or_si128(m3, m4);
+ u2 = _mm_extract_epi64(m3, 0);
+ u3 = _mm_extract_epi64(m3, 1);
+ /* Table accumulation: byte i of u0, u1, u2, u3 selects (by its low nibble) one of 16 entries in table rows i, i + 8, i + 16, i + 24. */
+ for (i = 0; i < 8; i++) {
+ p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st)); /* arithmetic on void * (compiler extension); entries are sizeof(fb_st) bytes apart */
+ r0 = _mm_xor_si128(r0, *(__m128i *)(p)); /* direct __m128i dereference: presumes the table entries are 16-byte aligned -- confirm */
+ r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
+ u0 >>= 8;
+ p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st));
+ r0 = _mm_xor_si128(r0, *(__m128i *)(p));
+ r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
+ u1 >>= 8;
+ p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st));
+ r0 = _mm_xor_si128(r0, *(__m128i *)(p));
+ r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
+ u2 >>= 8;
+ p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st));
+ r0 = _mm_xor_si128(r0, *(__m128i *)(p));
+ r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
+ u3 >>= 8;
+ }
+ /* Aligned stores: c must be 16-byte aligned. */
+ _mm_store_si128((__m128i *)c, r0);
+ _mm_store_si128((__m128i *)(c + 2), r1);
+}
diff --git a/src/low/curve2251-sse/relic_fb_sqr_low.c b/src/low/curve2251-sse/relic_fb_sqr_low.c
new file mode 100755
index 000000000..692793da3
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_sqr_low.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright 2007 Project RELIC
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file.
+ *
+ * RELIC is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field squaring.
+ *
+ * @version $Id$
+ * @ingroup fb
+ */
+
+#include
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_util.h"
+#include "macros.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+#include
+
+/**
+ * Squares a binary field element without reduction by inserting a zero bit
+ * after every bit of the operand.
+ *
+ * @param[out] c	- the result, 2 * FB_DIGS digits, written with aligned stores.
+ * @param[in] a		- the element to square, FB_DIGS digits.
+ */
+void fb_sqrn_low(dig_t *c, dig_t *a) {
+	/* Masks of the shift-or-mask ladder, built once instead of per iteration. */
+	const __m128i keep16 = _mm_set1_epi32(0x0000FFFF);
+	const __m128i keep8 = _mm_set1_epi32(0x00FF00FF);
+	const __m128i keep4 = _mm_set1_epi32(0x0F0F0F0F);
+	const __m128i keep2 = _mm_set1_epi32(0x33333333);
+	const __m128i keep1 = _mm_set1_epi32(0x55555555);
+
+	for (int i = 0; i < FB_DIGS; i++) {
+		/* One 32-bit half per 64-bit lane; the halves are selected explicitly */
+		/* rather than through a full-width mask and implicit narrowing. */
+		__m128i t = _mm_set_epi32(0, (int)(a[i] >> 32), 0, (int)(a[i] & 0xFFFFFFFF));
+
+		t = _mm_and_si128(_mm_or_si128(_mm_slli_epi64(t, 16), t), keep16);
+		t = _mm_and_si128(_mm_or_si128(_mm_slli_epi64(t, 8), t), keep8);
+		t = _mm_and_si128(_mm_or_si128(_mm_slli_epi64(t, 4), t), keep4);
+		t = _mm_and_si128(_mm_or_si128(_mm_slli_epi64(t, 2), t), keep2);
+		t = _mm_and_si128(_mm_or_si128(_mm_slli_epi64(t, 1), t), keep1);
+		/* Digit i expands into digits 2i (low half) and 2i + 1 (high half). */
+		_mm_store_si128((__m128i *)(c + 2 * i), t);
+	}
+}
+
+/**
+ * Expands a binary field element into its unreduced square by table lookup:
+ * every nibble is replaced by a byte holding its bits at the even positions.
+ * No reduction is performed.
+ *
+ * @param[out] c	- the result, eight digits written with aligned stores.
+ * @param[in] a		- the element to square, four digits read with aligned loads.
+ */
+void fb_sqrl_low(dig_t *c, dig_t *a) {
+	/* Byte k of the table is nibble k with a zero bit inserted after each bit. */
+	const __m128i spread = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100);
+	const __m128i nibble = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);
+	__m128i lo_half, hi_half, even, odd;
+
+	/* All loads from a happen before the first store to c. */
+	lo_half = _mm_load_si128((__m128i *)(a));
+	hi_half = _mm_load_si128((__m128i *)(a + 2));
+
+	/* Look up low and high nibbles, then interleave the resulting bytes. */
+	even = _mm_shuffle_epi8(spread, _mm_and_si128(lo_half, nibble));
+	odd = _mm_shuffle_epi8(spread, _mm_and_si128(_mm_srli_epi64(lo_half, 4), nibble));
+	_mm_store_si128((__m128i *)(c + 0), _mm_unpacklo_epi8(even, odd));
+	_mm_store_si128((__m128i *)(c + 2), _mm_unpackhi_epi8(even, odd));
+
+	/* Same procedure for the upper two digits of a. */
+	even = _mm_shuffle_epi8(spread, _mm_and_si128(hi_half, nibble));
+	odd = _mm_shuffle_epi8(spread, _mm_and_si128(_mm_srli_epi64(hi_half, 4), nibble));
+	_mm_store_si128((__m128i *)(c + 4), _mm_unpacklo_epi8(even, odd));
+	_mm_store_si128((__m128i *)(c + 6), _mm_unpackhi_epi8(even, odd));
+}
+
+#if defined(__INTEL_COMPILER)
+
+void fb_sqrm_low(dig_t *c, dig_t *a) { /* Intel-compiler variant: expands a by nibble lookup, applies REDUCE(), stores c[0..3]. */
+ __m128i t0, t1, m0, m1, m2, m3, m4, m5, m6, m8, m9, mask; /* NOTE(review): t1, m8 and m9 are not referenced in this body; presumably REDUCE() uses them - confirm in macros.h */
+ align dig_t x[2]; /* scratch for the upper 128 bits of the result */
+
+ t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100); /* byte k = nibble k with a zero bit inserted after each bit */
+ mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); /* keeps the low nibble of every byte */
+
+ m0 = _mm_load_si128((__m128i *)(a)); /* low two digits of a (aligned load) */
+ m1 = _mm_and_si128(m0, mask);
+ m1 = _mm_shuffle_epi8(t0, m1); /* spread low nibbles */
+ m2 = _mm_srli_epi64(m0, 4);
+ m2 = _mm_and_si128(m2, mask);
+ m2 = _mm_shuffle_epi8(t0, m2); /* spread high nibbles */
+ m3 = _mm_unpacklo_epi8(m1, m2); /* interleave the bytes: expanded digits 0-1 */
+ m4 = _mm_unpackhi_epi8(m1, m2); /* expanded digits 2-3 */
+
+ m0 = _mm_load_si128((__m128i *)(a+2)); /* high two digits of a (aligned load) */
+ m1 = _mm_and_si128(m0, mask);
+ m1 = _mm_shuffle_epi8(t0, m1);
+ m2 = _mm_srli_epi64(m0, 4);
+ m2 = _mm_and_si128(m2, mask);
+ m2 = _mm_shuffle_epi8(t0, m2);
+ m5 = _mm_unpacklo_epi8(m1, m2); /* expanded digits 4-5 */
+ m6 = _mm_unpackhi_epi8(m1, m2); /* expanded digits 6-7 */
+
+ m0 = m3; /* repack the expanded value as m3:m2:m1:m0, most significant first */
+ m1 = m4;
+ m2 = m5;
+ m3 = m6;
+
+ REDUCE(); /* macro from macros.h; presumably reduces m3:m2:m1:m0 into m1:m0 - confirm there */
+ _mm_store_si128((__m128i *) c + 0, m0); /* c[0..1], aligned store */
+ _mm_store_si128((__m128i *) x, m1);
+ c[2] = x[0];
+ c[3] = x[1] & 0x07FFFFFFFFFFFFFF; /* keep only the low 59 bits of the top digit */
+}
+
+#else
+
+/**
+ * Squares a binary field element and reduces the result; this variant is
+ * compiled when __INTEL_COMPILER is not defined.
+ *
+ * The operand is first expanded to eight digits by a nibble lookup that
+ * inserts a zero bit after every input bit. The upper digits are then folded
+ * downwards one word at a time: word i is XORed into word i - 4 after left
+ * shifts by 5, 7, 9 and 12 bits, and into word i - 3 after right shifts by
+ * 59, 57, 55 and 52 bits, which is consistent with reduction modulo
+ * z^251 + z^7 + z^4 + z^2 + 1. Finally, bits 59..63 of word 3 are folded
+ * into word 0 and cleared.
+ * The result is written to c only after a has been fully read.
+ *
+ * @param[out] c	- the reduced square, written to c[0..3].
+ * @param[in] a		- the element to square, read with aligned 128-bit loads.
+ */
+void fb_sqrm_low(dig_t *c, dig_t *a) {
+	/* Shift amounts of the word-wise folding; each pair adds up to 64. */
+	enum {
+		SHR_T0 = 59,	/* right shift, constant term */
+		SHR_T2 = 57,	/* right shift, z^2 term */
+		SHR_T4 = 55,	/* right shift, z^4 term */
+		SHR_T7 = 52,	/* right shift, z^7 term */
+		SHL_T0 = 5,	/* left shift, constant term */
+		SHL_T2 = 7,	/* left shift, z^2 term */
+		SHL_T4 = 9,	/* left shift, z^4 term */
+		SHL_T7 = 12	/* left shift, z^7 term */
+	};
+
+	/* Byte k of the table is nibble k with a zero bit inserted after each bit. */
+	const __m128i spread = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100);
+	const __m128i nibble = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);
+
+	align dig_t wide[2 * FB_DIGS];	/* unreduced square */
+	__m128i in;			/* current 128-bit half of a */
+	__m128i even;			/* spread low nibbles */
+	__m128i odd;			/* spread high nibbles */
+	dig_t w0, w1, w2, w3, w4;	/* words being reduced */
+	dig_t d;			/* word currently being folded */
+
+	/* Expand the low two digits of a into wide[0..3]. */
+	in = _mm_load_si128((__m128i *)(a));
+	even = _mm_shuffle_epi8(spread, _mm_and_si128(in, nibble));
+	odd = _mm_shuffle_epi8(spread, _mm_and_si128(_mm_srli_epi64(in, 4), nibble));
+	_mm_store_si128((__m128i *)wide + 0, _mm_unpacklo_epi8(even, odd));
+	_mm_store_si128((__m128i *)wide + 1, _mm_unpackhi_epi8(even, odd));
+
+	/* Expand the high two digits of a into wide[4..7]. */
+	in = _mm_load_si128((__m128i *)(a + 2));
+	even = _mm_shuffle_epi8(spread, _mm_and_si128(in, nibble));
+	odd = _mm_shuffle_epi8(spread, _mm_and_si128(_mm_srli_epi64(in, 4), nibble));
+	_mm_store_si128((__m128i *)wide + 2, _mm_unpacklo_epi8(even, odd));
+	_mm_store_si128((__m128i *)wide + 3, _mm_unpackhi_epi8(even, odd));
+
+	/* Only the low five words are updated; wide[5..7] are just read. */
+	w0 = wide[0];
+	w1 = wide[1];
+	w2 = wide[2];
+	w3 = wide[3];
+	w4 = wide[4];
+
+	/* Fold wide[7] into words 4 and 3. */
+	d = wide[7];
+	w4 ^= (d >> SHR_T0) ^ (d >> SHR_T7)
+			^ (d >> SHR_T4) ^ (d >> SHR_T2);
+	w3 ^= (d << SHL_T0) ^ (d << SHL_T7)
+			^ (d << SHL_T4) ^ (d << SHL_T2);
+
+	/* Fold wide[6] into words 3 and 2. */
+	d = wide[6];
+	w3 ^= (d >> SHR_T0) ^ (d >> SHR_T7)
+			^ (d >> SHR_T4) ^ (d >> SHR_T2);
+	w2 ^= (d << SHL_T0) ^ (d << SHL_T7)
+			^ (d << SHL_T4) ^ (d << SHL_T2);
+
+	/* Fold wide[5] into words 2 and 1. */
+	d = wide[5];
+	w2 ^= (d >> SHR_T0) ^ (d >> SHR_T7)
+			^ (d >> SHR_T4) ^ (d >> SHR_T2);
+	w1 ^= (d << SHL_T0) ^ (d << SHL_T7)
+			^ (d << SHL_T4) ^ (d << SHL_T2);
+
+	/* Word 4, including the part received from wide[7], goes to words 1 and 0. */
+	d = w4;
+	w1 ^= (d >> SHR_T0) ^ (d >> SHR_T7)
+			^ (d >> SHR_T4) ^ (d >> SHR_T2);
+	w0 ^= (d << SHL_T0) ^ (d << SHL_T7)
+			^ (d << SHL_T4) ^ (d << SHL_T2);
+
+	/* Bits 59..63 of word 3 exceed 251 bits: add them at bit 0, then realign */
+	/* them to their original position for the z^7, z^4 and z^2 terms. */
+	d = w3 >> SHR_T0;
+	w0 ^= d;
+	d <<= SHR_T0;
+	w0 ^= (d >> SHR_T7);
+	w0 ^= (d >> SHR_T4);
+	w0 ^= (d >> SHR_T2);
+	w3 ^= d;	/* clear the folded bits */
+
+	/* Store the four reduced words. */
+	c[0] = w0;
+	c[1] = w1;
+	c[2] = w2;
+	c[3] = w3;
+}
+
+#endif
diff --git a/src/low/curve2251-sse/relic_fb_srt_low.c b/src/low/curve2251-sse/relic_fb_srt_low.c
new file mode 100755
index 000000000..7bc0d8f9f
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_srt_low.c
@@ -0,0 +1,184 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level binary field square root.
+ *
+ * @version $Id: relic_fb_srt_low.c 207 2009-12-25 20:15:28Z dfaranha $
+ * @ingroup fb
+ */
+
+#include
+#include
+#ifdef __PCLMUL__
+#include
+#endif
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+#include "relic_util.h"
+#include "macros.h"
+
+/*============================================================================*/
+/* Private definitions */
+/*============================================================================*/
+
+#define HALF ((int)((FB_BITS / 2)/(FB_DIGIT) + ((FB_BITS / 2) % FB_DIGIT > 0)))
+
+#ifndef __PCLMUL__
+
+void fb_mulh_low(dig_t *c, dig_t *a) { /* Table-driven variant (built when __PCLMUL__ is not defined): combines fb_poly_tab_srz() entries for the 16 bytes of a[1], a[0], reduces with RED251 and stores c[0..3]. */
+ __m128i m0, m1, m2, m3, m8, m9, t0, t1; /* NOTE(review): t0 and t1 are not referenced in this body; presumably RED251 uses them - confirm in macros.h */
+ unsigned char ta; /* current byte of the operand */
+ int j; /* bit offset of that byte within its digit */
+ align dig_t x[2]; /* scratch for the upper 128 bits of the result */
+ dig_t *tab; /* table entry selected inside M() */
+ /* LSHIFT8: shifts the 384-bit value m2:m1:m0 left by one byte. */
+#define LSHIFT8(m2,m1,m0)\
+ m2=_mm_alignr_epi8(m2,m1,15);\
+ m1=_mm_alignr_epi8(m1,m0,15);\
+ m0=_mm_slli_si128(m0,1);
+ /* M: XORs the 256 bits at fb_poly_tab_srz(ta) into m1:m0. */
+#define M(m1,m0,ta)\
+ tab = fb_poly_tab_srz(ta);\
+ m0=_mm_xor_si128(m0, ((__m128i *)tab)[0]);\
+ m1=_mm_xor_si128(m1, ((__m128i *)tab)[1]);\
+
+ // Main computation
+ m0 = m1 = m2 = m3 = _mm_setzero_si128();
+
+ for (j = 56; j >= 0; j -= 8) { /* bytes of a[1], most significant first */
+ ta = (a[1] >> j) & 0xFF;
+ M(m1, m0, ta);
+ LSHIFT8(m2, m1, m0);
+ }
+ for (j = 56; j >= 8; j -= 8) { /* bytes 7..1 of a[0] */
+ ta = (a[0] >> j) & 0xFF;
+ M(m1, m0, ta);
+ LSHIFT8(m2, m1, m0);
+ }
+ ta = a[0] & 0xFF; /* lowest byte: accumulated without a trailing shift */
+ M(m1, m0, ta);
+ /* After RED251 (macros.h), bits 59..63 of the top digit are isolated in m9 and XORed into m0 at right shifts 59, 57, 55 and 52. */
+ RED251(m2,m1,m0); \
+ m8 = _mm_srli_si128(m1,8); \
+ m9 = _mm_srli_epi64(m8,59); \
+ m9 = _mm_slli_epi64(m9,59); \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,59)); \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,57)); \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,55)); \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,52)); \
+
+ _mm_store_si128((__m128i *) c + 0, m0); /* c[0..1], aligned store */
+ _mm_store_si128((__m128i *) x, m1);
+ c[2] = x[0];
+ c[3] = x[1] & 0x07FFFFFFFFFFFFFF; /* keep only the low 59 bits of the top digit */
+#undef M /* NOTE(review): LSHIFT8 is not undefined here and stays visible for the rest of the file */
+}
+
+#else
+
+void fb_mulh_low(dig_t *c, dig_t *a) { /* PCLMUL variant: multiplies the 128 bits at a by the 256 bits at fb_poly_get_srz(), reduces with RED251 and stores c[0..3]. */
+ __m128i ma0, ma1, mb0, mb1, m0, m1, m2, m3, m4, m5, m8, m9, t0, t1, t2, t3; /* NOTE(review): ma1, m3 and m5 are not referenced in this body; t0..t3 are written by MUL */
+ dig_t *b = fb_poly_get_srz(); /* constant multiplicand; presumably sqrt(z) - confirm against its definition */
+
+ ma0 = _mm_load_si128((__m128i *)a); /* only a[0..1] are read (aligned load) */
+ mb0 = _mm_load_si128((__m128i *)b); /* low 128 bits of b */
+ mb1 = _mm_load_si128((__m128i *)b + 1); /* high 128 bits of b */
+
+ MUL(ma0, mb0); /* macro from macros.h built on _mm_clmulepi64_si128; its result is read from t0 and t1 */
+ m0 = t0;
+ m1 = t1;
+
+ mb0 = XOR(mb0, mb1); /* low half XOR high half of b */
+
+ MUL(ma0, mb0);
+ m4 = _mm_xor_si128(t0, m0); /* removes the first product again; presumably leaves a times the high half of b (relies on MUL being XOR-linear) */
+ m2 = _mm_xor_si128(t1, m1);
+
+ m1 = XOR(m1, m4); /* accumulate the overlapping middle 128 bits */
+
+ align dig_t _x[2]; /* scratch for the upper 128 bits of the result */
+ /* After RED251 (macros.h), bits 59..63 of the top digit are isolated in m9 and XORed into m0 at right shifts 59, 57, 55 and 52. */
+ RED251(m2,m1,m0); \
+ m8 = _mm_srli_si128(m1,8); \
+ m9 = _mm_srli_epi64(m8,59); \
+ m9 = _mm_slli_epi64(m9,59); \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,59)); \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,57)); \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,55)); \
+ m0 = _mm_xor_si128(m0,_mm_srli_epi64(m9,52)); \
+ _mm_store_si128((__m128i *) c + 0, m0);
+ _mm_store_si128((__m128i *) _x, m1);
+ c[2] = _x[0];
+ c[3] = _x[1] & 0x07FFFFFFFFFFFFFF; /* keep only the low 59 bits of the top digit */
+ return;
+#undef M /* NOTE(review): M is only defined in the non-PCLMUL branch, so this #undef has no effect here */
+}
+
+#endif
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+/**
+ * Low-level square root step: separates a into its even-indexed and
+ * odd-indexed bits, sends the odd half through fb_mulh_low() and XORs the
+ * even half into the first HALF digits of the result.
+ *
+ * @param[out] c	- the result.
+ * @param[in] a		- the operand, read with aligned 128-bit loads.
+ */
+void fb_srtn_low(dig_t *c, dig_t *a) {
+	/* Nibble table: even bits land in bits 0-1, odd bits in bits 4-5. */
+	const __m128i tab_lo = _mm_set_epi32(0x33322322, 0x31302120, 0x13120302, 0x11100100);
+	/* The same table shifted left by two, applied to the high nibble. */
+	const __m128i tab_hi = _mm_set_epi32(0xccc88c88, 0xc4c08480, 0x4c480c08, 0x44400400);
+	/* Moves even-numbered bytes to the low lane, odd-numbered to the high. */
+	const __m128i gather = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200);
+	const __m128i hi_nib = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0);
+	const __m128i lo_nib = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);
+	align dig_t lane[2];
+	align dig_t even[FB_DIGS] = {0};
+	align dig_t odd[FB_DIGS] = {0};
+	__m128i v, packed;
+	int i;
+
+	for (i = 0; i < FB_DIGS; i += 2) {
+		v = _mm_shuffle_epi8(_mm_load_si128((__m128i *)&a[i]), gather);
+		packed = _mm_or_si128(
+				_mm_shuffle_epi8(tab_lo, _mm_and_si128(v, lo_nib)),
+				_mm_shuffle_epi8(tab_hi, _mm_srli_epi64(_mm_and_si128(v, hi_nib), 4)));
+		/* Low nibbles carry even bits, high nibbles odd bits; merge the lanes. */
+		_mm_store_si128((__m128i *)lane, _mm_and_si128(packed, lo_nib));
+		even[i / 2] = lane[0] | (lane[1] << 4);
+		_mm_store_si128((__m128i *)lane, _mm_and_si128(packed, hi_nib));
+		odd[i / 2] = lane[1] | (lane[0] >> 4);
+	}
+	fb_mulh_low(c, odd);
+	for (i = 0; i < HALF; i++) {
+		c[i] ^= even[i];
+	}
+}
diff --git a/src/low/curve2251-sse/relic_fb_trc_low.c b/src/low/curve2251-sse/relic_fb_trc_low.c
new file mode 100644
index 000000000..6b251747e
--- /dev/null
+++ b/src/low/curve2251-sse/relic_fb_trc_low.c
@@ -0,0 +1,41 @@
+/*
+ * RELIC is an Efficient LIbrary for Cryptography
+ * Copyright (C) 2007-2011 RELIC Authors
+ *
+ * This file is part of RELIC. RELIC is legal property of its developers,
+ * whose names are not listed here. Please refer to the COPYRIGHT file
+ * for contact information.
+ *
+ * RELIC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * RELIC is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with RELIC. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file
+ *
+ * Implementation of the low-level trace function.
+ *
+ * @version $Id: relic_fb_slv_low.c 652 2011-02-20 23:50:00Z dfaranha $
+ * @ingroup fb
+ */
+
+#include "relic_fb.h"
+#include "relic_fb_low.h"
+
+/*============================================================================*/
+/* Public definitions */
+/*============================================================================*/
+
+dig_t fb_trcn_low(dig_t *a) { /* Returns bit 0 of a[0] XOR bits 55 and 57 of a[3], as a value in {0, 1}. */
+	return ((a[3] >> 57) ^ (a[3] >> 55) ^ a[0]) & 0x01;
+}
diff --git a/test/test_fb.c b/test/test_fb.c
index 4560bc7d8..07587ea98 100644
--- a/test/test_fb.c
+++ b/test/test_fb.c
@@ -690,15 +690,21 @@ static int reduction(void) {
dv_new(t1);
TEST_BEGIN("modular reduction is correct") {
- fb_rand(a);
- /* Test if a * f(z) mod f(z) == 0. */
- fb_mul(b, a, fb_poly_get());
if (FB_POLYN % FB_DIGIT == 0) {
+ /* Test if a * f(z) mod f(z) == 0. */
+ fb_rand(a);
+ fb_mul(b, a, fb_poly_get());
fb_copy(t0, b);
fb_copy(t0 + FB_DIGS, a);
- fb_rdc(b, t0);
+ fb_rdc(a, t0);
+ } else {
+ /* Test if f(z) * z^(m-1) mod f(z) == 0. */
+ dv_zero(t0, FB_DIGS);
+ t0[FB_DIGS - 1] = (dig_t)1 << (FB_DIGIT - 1);
+ fb_rsh(t0 + FB_DIGS, fb_poly_get(), 1);
+ fb_rdc(a, t0);
}
- TEST_ASSERT(fb_is_zero(b) == 1, end);
+ TEST_ASSERT(fb_is_zero(a) == 1, end);
}
TEST_END;