From e3a94101b4757e3ec9a3981e58882e75c40bba0d Mon Sep 17 00:00:00 2001 From: despair86 Date: Wed, 26 Sep 2018 21:19:34 -0500 Subject: [PATCH] the AVX2 codepaths now appear to be 32-bit clean. old hard-coded inline asm is still included if requested. -rick nb: is a vector of eight floats not the same layout as a simple linear array of same? (Aside from the alignment requirements) netbsd-family build fixes, also - the AVX2 codepaths are _compiler-specific_, they use features _exclusive_ to gcc and clang --- crypto/libntrup/src/avx/mult.c | 26 ++++++++++++++++--- crypto/libntrup/src/avx/rq.c | 5 ++++ crypto/libntrup/src/avx/rq_mod3.c | 6 +++++ include/llarp/ip.hpp | 15 +++-------- vendor/cppbackport-master/lib/fs/diriter.cpp | 2 ++ vendor/libtuntap-master/tuntap-unix-openbsd.c | 1 - 6 files changed, 39 insertions(+), 16 deletions(-) diff --git a/crypto/libntrup/src/avx/mult.c b/crypto/libntrup/src/avx/mult.c index e01e1cd33..b0e9e609d 100644 --- a/crypto/libntrup/src/avx/mult.c +++ b/crypto/libntrup/src/avx/mult.c @@ -8,12 +8,12 @@ #define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \ gj = g[j]; \ h0 += f0 * gj; \ - _mm256_storeu_ps(&h[i + j],h0); \ + _mm256_storeu_ps((float*)&h[i + j],h0); \ h1 += f1 * gj; \ h2 += f2 * gj; \ h3 += f3 * gj; \ h4 += f4 * gj; \ - h0 = _mm256_loadu_ps(&h[i + j + 5]); \ + h0 = _mm256_loadu_ps((float*)&h[i + j + 5]); \ h0 += f5 * gj; #define MULSTEP_asm(j,h0,h1,h2,h3,h4) \ @@ -30,9 +30,19 @@ : "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \ : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5])); -#define MULSTEP MULSTEP_asm +#define MULSTEP MULSTEP_gcc #define MULSTEP_noload(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + h0 += gj*f0; \ + _mm256_storeu_ps((float*)&h[i+j], h0); \ + h1 += gj*f1; \ + h2 += gj*f2; \ + h3 += gj*f3; \ + h4 += gj*f4; \ + h0 = gj* f5; + +#define MULSTEP_noload_asm(j,h0,h1,h2,h3,h4) \ gj = g[j]; \ __asm__( \ "vfmadd231ps %5,%6,%0 \n\t" \ @@ -46,6 +56,16 @@ : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j])); #define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + h0 = gj*f0; \ + _mm256_storeu_ps((float*)&h[i+j], h0); \ + h1 = gj*f1; \ + h2 = gj*f2; \ + h3 = gj*f3; \ + h4 = gj*f4; \ + h0 = gj*f5; + +#define MULSTEP_fromzero_asm(j,h0,h1,h2,h3,h4) \ gj = g[j]; \ __asm__( \ "vmulps %5,%6,%0 \n\t" \ diff --git a/crypto/libntrup/src/avx/rq.c b/crypto/libntrup/src/avx/rq.c index b6dc8ab6c..55e1c132d 100644 --- a/crypto/libntrup/src/avx/rq.c +++ b/crypto/libntrup/src/avx/rq.c @@ -21,6 +21,11 @@ #define broadcast(r) _mm256_set1_pd(r) #define floor(x) _mm256_floor_pd(x) +#ifndef __amd64__ +#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \ + __a[N];})) +#endif + void rq_encode(unsigned char *c,const modq *f) { crypto_int32 f0, f1, f2, f3, f4; diff --git a/crypto/libntrup/src/avx/rq_mod3.c b/crypto/libntrup/src/avx/rq_mod3.c index c2b7059b5..ff4ed0c4f 100644 --- a/crypto/libntrup/src/avx/rq_mod3.c +++ b/crypto/libntrup/src/avx/rq_mod3.c @@ -1,5 +1,6 @@ #if __AVX2__ #include +#include #include "mod3.h" #include "rq.h" @@ -9,6 +10,11 @@ #define v4591_16 _mm256_set1_epi16(4591) #define v10923_16 _mm256_set1_epi16(10923) +#ifndef __amd64__ +#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \ + __a[N];})) +#endif + static inline __m256i squeeze(__m256i x) { __m256i q = _mm256_mulhrs_epi16(x,v7); diff --git a/include/llarp/ip.hpp b/include/llarp/ip.hpp index 1a4fa0e5c..8ab3e17b2 100644 --- a/include/llarp/ip.hpp +++ b/include/llarp/ip.hpp @@ -5,13 +5,13 @@ #include #ifndef _WIN32 +// unix, linux #include // FreeBSD needs this for uchar for ip.h #include #include #else +// windows nt #include -// Apparently this does not seem to be located _anywhere_ in the windows sdk??? -// -despair86 typedef struct ip_hdr { unsigned char @@ -46,16 +46,7 @@ typedef struct ip_hdr #include -#if !defined(__linux__) && !defined(_WIN32) && !defined(__APPLE__) \ - && !defined(__FreeBSD__) -#define iphdr ip -#define saddr ip_src.s_addr -#define daddr ip_dst.s_addr -#define ip_version ip_v -#define check ip_sum -#define ihl ip_hl -#endif - +// anything not win32 struct ip_header { #if __BYTE_ORDER == __LITTLE_ENDIAN diff --git a/vendor/cppbackport-master/lib/fs/diriter.cpp b/vendor/cppbackport-master/lib/fs/diriter.cpp index ea8232026..01e7a8601 100644 --- a/vendor/cppbackport-master/lib/fs/diriter.cpp +++ b/vendor/cppbackport-master/lib/fs/diriter.cpp @@ -34,7 +34,9 @@ #include #include +#ifdef _WIN32 #include +#endif #include "direntry.h" #include "path.h" diff --git a/vendor/libtuntap-master/tuntap-unix-openbsd.c b/vendor/libtuntap-master/tuntap-unix-openbsd.c index 830540162..487ff4822 100644 --- a/vendor/libtuntap-master/tuntap-unix-openbsd.c +++ b/vendor/libtuntap-master/tuntap-unix-openbsd.c @@ -135,7 +135,6 @@ tuntap_sys_start(struct device *dev, int mode, int tun) { "Can't get link-layer address"); return fd; } - (void)memcpy(dev->hwaddr, &addr, ETHER_ADDR_LEN); } return fd; }