separated avx2 code by architecture

6 years ago · 3b9ce8d41d
parent 11753f5d7e
commit 3b9ce8d41d
3 changed files with 43 additions and 4 deletions
--- a/crypto/libntrup/src/avx/mult.c
+++ b/crypto/libntrup/src/avx/mult.c
@ -5,17 +5,44 @@
 #include "rq.h"
 #include "r3.h"

+// 32-bit hosts: use compiler builtins and let the compiler
+// perform register allocation and/or spilling to memory
+#ifndef __amd64__
 #define MULSTEP_gcc(j,h0,h1,h2,h3,h4) /* one multiply-accumulate step; gj, g, h, i and f0..f5 come from the expansion site */ \
  gj = g[j]; \
  h0 += f0 * gj; \
-  _mm256_storeu_ps(&h[i + j],h0); \
+  _mm256_storeu_ps((float*)&h[i + j],h0); /* store the updated h0 to h[i + j] */ \
  h1 += f1 * gj; \
  h2 += f2 * gj; \
  h3 += f3 * gj; \
  h4 += f4 * gj; \
-  h0 = _mm256_loadu_ps(&h[i + j + 5]); \
+  h0 = _mm256_loadu_ps((float*)&h[i + j + 5]); /* reuse h0: reload it from h[i + j + 5] before adding the f5 term */ \
  h0 += f5 * gj;

+#define MULSTEP MULSTEP_gcc /* generic step name maps to the builtin-based variant here */
+
+#define MULSTEP_noload(j,h0,h1,h2,h3,h4) /* same step as MULSTEP_gcc, except h0 is not reloaded from h afterwards */ \
+  gj = g[j]; \
+  h0 += gj*f0; \
+  _mm256_storeu_ps((float*)&h[i+j], h0); /* store the updated h0 to h[i+j] */ \
+  h1 += gj*f1; \
+  h2 += gj*f2; \
+  h3 += gj*f3; \
+  h4 += gj*f4; \
+  h0 = gj* f5; /* plain assignment: h0 restarts from the f5 product, nothing is read from h[i+j+5] */
+
+#define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) /* step that assigns (=) every accumulator instead of adding (+=), so no prior h0..h4 values are read */ \
+  gj = g[j]; \
+  h0 = gj*f0; \
+  _mm256_storeu_ps((float*)&h[i+j], h0); /* store the f0 product to h[i+j] */ \
+  h1 = gj*f1; \
+  h2 = gj*f2; \
+  h3 = gj*f3; \
+  h4 = gj*f4; \
+  h0 = gj*f5; /* h0 is reused for the f5 product after its first value was stored */
+#else
+// 64-bit hosts: use inline asm as before
+#define MULSTEP MULSTEP_asm
 #define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
  gj = g[j]; \
  __asm__( \
@ -30,8 +57,6 @@
    : "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
    : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5]));

-#define MULSTEP MULSTEP_asm
-
 #define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
  gj = g[j]; \
  __asm__( \
@ -57,6 +82,7 @@
    "vmulps %5,%11,%0 \n\t" \
    : "=&x"(h0),"=&x"(h1),"=&x"(h2),"=&x"(h3),"=&x"(h4) \
    : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
+#endif

 static inline __m128i _mm_load_cvtepi8_epi16(const long long *x)
 {
--- a/crypto/libntrup/src/avx/rq.c
+++ b/crypto/libntrup/src/avx/rq.c
@ -21,6 +21,12 @@
 #define broadcast(r) _mm256_set1_pd(r)
 #define floor(x) _mm256_floor_pd(x)

+// 32-bit hosts only: emulate _mm_extract_epi64 by viewing X as __v2di and indexing element N
+#ifndef __amd64__
+#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                  __a[N];}))
+#endif
+
 void rq_encode(unsigned char *c,const modq *f)
 {
  crypto_int32 f0, f1, f2, f3, f4;
--- a/crypto/libntrup/src/avx/rq_mod3.c
+++ b/crypto/libntrup/src/avx/rq_mod3.c
@ -1,5 +1,6 @@
 #if __AVX2__
 #include <immintrin.h>
+#include <smmintrin.h>
 #include "mod3.h"
 #include "rq.h"

@ -9,6 +10,12 @@
 #define v4591_16 _mm256_set1_epi16(4591)
 #define v10923_16 _mm256_set1_epi16(10923)

+// 32-bit hosts only: emulate _mm_extract_epi64 by viewing X as __v2di and indexing element N
+#ifndef __amd64__
+#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                  __a[N];}))
+#endif
+
 static inline __m256i squeeze(__m256i x)
 {
  __m256i q = _mm256_mulhrs_epi16(x,v7);