From e3a94101b4757e3ec9a3981e58882e75c40bba0d Mon Sep 17 00:00:00 2001
From: despair86 <despair@rvx86.net>
Date: Wed, 26 Sep 2018 21:19:34 -0500
Subject: [PATCH] the AVX2 codepaths now appear to be 32-bit clean.

old hard-coded inline asm is still included if requested.
-rick

nb: is a vector of eight floats not the same layout as a simple linear array of same? (Aside from the alignment requirements)

netbsd-family build fixes, also - the AVX2 codepaths are _compiler-specific_, they use features _exclusive_ to gcc and clang
---
 crypto/libntrup/src/avx/mult.c                | 26 ++++++++++++++++---
 crypto/libntrup/src/avx/rq.c                  |  5 ++++
 crypto/libntrup/src/avx/rq_mod3.c             |  6 +++++
 include/llarp/ip.hpp                          | 15 +++--------
 vendor/cppbackport-master/lib/fs/diriter.cpp  |  2 ++
 vendor/libtuntap-master/tuntap-unix-openbsd.c |  1 -
 6 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/crypto/libntrup/src/avx/mult.c b/crypto/libntrup/src/avx/mult.c
index e01e1cd33..b0e9e609d 100644
--- a/crypto/libntrup/src/avx/mult.c
+++ b/crypto/libntrup/src/avx/mult.c
@@ -8,12 +8,12 @@
 #define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \
   gj = g[j]; \
   h0 += f0 * gj; \
-  _mm256_storeu_ps(&h[i + j],h0); \
+  _mm256_storeu_ps((float*)&h[i + j],h0); \
   h1 += f1 * gj; \
   h2 += f2 * gj; \
   h3 += f3 * gj; \
   h4 += f4 * gj; \
-  h0 = _mm256_loadu_ps(&h[i + j + 5]); \
+  h0 = _mm256_loadu_ps((float*)&h[i + j + 5]); \
   h0 += f5 * gj;
 
 #define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
@@ -30,9 +30,19 @@
     : "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
     : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5]));
 
-#define MULSTEP MULSTEP_asm
+#define MULSTEP MULSTEP_gcc
 
 #define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
+  gj = g[j]; \
+  h0 += gj*f0; \
+  _mm256_storeu_ps((float*)&h[i+j], h0); \
+  h1 += gj*f1; \
+  h2 += gj*f2; \
+  h3 += gj*f3; \
+  h4 += gj*f4; \
+  h0 = gj* f5;
+
+#define MULSTEP_noload_asm(j,h0,h1,h2,h3,h4) \
   gj = g[j]; \
   __asm__( \
     "vfmadd231ps %5,%6,%0 \n\t" \
@@ -46,6 +56,16 @@
     : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
 
 #define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \
+  gj = g[j]; \
+  h0 = gj*f0; \
+  _mm256_storeu_ps((float*)&h[i+j], h0); \
+  h1 = gj*f1; \
+  h2 = gj*f2; \
+  h3 = gj*f3; \
+  h4 = gj*f4; \
+  h0 = gj*f5;
+
+#define MULSTEP_fromzero_asm(j,h0,h1,h2,h3,h4) \
   gj = g[j]; \
   __asm__( \
     "vmulps %5,%6,%0 \n\t" \
diff --git a/crypto/libntrup/src/avx/rq.c b/crypto/libntrup/src/avx/rq.c
index b6dc8ab6c..55e1c132d 100644
--- a/crypto/libntrup/src/avx/rq.c
+++ b/crypto/libntrup/src/avx/rq.c
@@ -21,6 +21,11 @@
 #define broadcast(r) _mm256_set1_pd(r)
 #define floor(x) _mm256_floor_pd(x)
 
+#ifndef __amd64__
+#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                  __a[N];}))
+#endif
+
 void rq_encode(unsigned char *c,const modq *f)
 {
   crypto_int32 f0, f1, f2, f3, f4;
diff --git a/crypto/libntrup/src/avx/rq_mod3.c b/crypto/libntrup/src/avx/rq_mod3.c
index c2b7059b5..ff4ed0c4f 100644
--- a/crypto/libntrup/src/avx/rq_mod3.c
+++ b/crypto/libntrup/src/avx/rq_mod3.c
@@ -1,5 +1,6 @@
 #if __AVX2__
 #include <immintrin.h>
+#include <smmintrin.h>
 #include "mod3.h"
 #include "rq.h"
 
@@ -9,6 +10,11 @@
 #define v4591_16 _mm256_set1_epi16(4591)
 #define v10923_16 _mm256_set1_epi16(10923)
 
+#ifndef __amd64__
+#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
+                                                  __a[N];}))
+#endif
+
 static inline __m256i squeeze(__m256i x)
 {
   __m256i q = _mm256_mulhrs_epi16(x,v7);
diff --git a/include/llarp/ip.hpp b/include/llarp/ip.hpp
index 1a4fa0e5c..8ab3e17b2 100644
--- a/include/llarp/ip.hpp
+++ b/include/llarp/ip.hpp
@@ -5,13 +5,13 @@
 #include <llarp/net.hpp>
 
 #ifndef _WIN32
+// unix, linux
 #include <sys/types.h>  // FreeBSD needs this for uchar for ip.h
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #else
+// windows nt
 #include <winsock2.h>
-// Apparently this does not seem to be located _anywhere_ in the windows sdk???
-// -despair86
 typedef struct ip_hdr
 {
   unsigned char
@@ -46,16 +46,7 @@ typedef struct ip_hdr
 
 #include <memory>
 
-#if !defined(__linux__) && !defined(_WIN32) && !defined(__APPLE__) \
-    && !defined(__FreeBSD__)
-#define iphdr ip
-#define saddr ip_src.s_addr
-#define daddr ip_dst.s_addr
-#define ip_version ip_v
-#define check ip_sum
-#define ihl ip_hl
-#endif
-
+// anything not win32
 struct ip_header
 {
 #if __BYTE_ORDER == __LITTLE_ENDIAN
diff --git a/vendor/cppbackport-master/lib/fs/diriter.cpp b/vendor/cppbackport-master/lib/fs/diriter.cpp
index ea8232026..01e7a8601 100644
--- a/vendor/cppbackport-master/lib/fs/diriter.cpp
+++ b/vendor/cppbackport-master/lib/fs/diriter.cpp
@@ -34,7 +34,9 @@
 #include <climits>
 
 #include <dirent.h>
+#ifdef _WIN32
 #include <io.h>
+#endif
 
 #include "direntry.h"
 #include "path.h"
diff --git a/vendor/libtuntap-master/tuntap-unix-openbsd.c b/vendor/libtuntap-master/tuntap-unix-openbsd.c
index 830540162..487ff4822 100644
--- a/vendor/libtuntap-master/tuntap-unix-openbsd.c
+++ b/vendor/libtuntap-master/tuntap-unix-openbsd.c
@@ -135,7 +135,6 @@ tuntap_sys_start(struct device *dev, int mode, int tun) {
 			    "Can't get link-layer address");
 			return fd;
 		}
-		(void)memcpy(dev->hwaddr, &addr, ETHER_ADDR_LEN);
 	}
 	return fd;
 }