the AVX2 codepaths now appear to be 32-bit clean.

old hard-coded inline asm is still included if requested.
-rick

NB: isn't a vector of eight floats laid out the same way as a plain linear array of eight floats (aside from the alignment requirements)?

NetBSD-family build fixes. Also, the AVX2 codepaths are _compiler-specific_: they use features _exclusive_ to gcc and clang.
pull/18/head
despair86 6 years ago
parent 7f809eb53b
commit e3a94101b4

@ -8,12 +8,12 @@
#define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
h0 += f0 * gj; \
_mm256_storeu_ps(&h[i + j],h0); \
_mm256_storeu_ps((float*)&h[i + j],h0); \
h1 += f1 * gj; \
h2 += f2 * gj; \
h3 += f3 * gj; \
h4 += f4 * gj; \
h0 = _mm256_loadu_ps(&h[i + j + 5]); \
h0 = _mm256_loadu_ps((float*)&h[i + j + 5]); \
h0 += f5 * gj;
#define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
@ -30,9 +30,19 @@
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5]));
#define MULSTEP MULSTEP_asm
#define MULSTEP MULSTEP_gcc
/*
 * MULSTEP_noload(j,h0,h1,h2,h3,h4): one multiply step written with plain
 * operators instead of inline asm.
 *
 * Reads g[j] into gj, accumulates gj*f0 .. gj*f4 into h0 .. h4, writes the
 * updated h0 to h[i+j] with an unaligned store, and finally overwrites h0
 * with gj*f5 (nothing is loaded from h in this variant).
 *
 * The names gj, g, h, i and f0 .. f5 are not parameters; they must already be
 * in scope where the macro is expanded.
 * NOTE(review): the `*` and `+=` on the operands presumably rely on GCC/Clang
 * vector-type arithmetic -- confirm against the declarations at the call site.
 */
#define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
h0 += gj*f0; \
_mm256_storeu_ps((float*)&h[i+j], h0); \
h1 += gj*f1; \
h2 += gj*f2; \
h3 += gj*f3; \
h4 += gj*f4; \
h0 = gj* f5;
#define MULSTEP_noload_asm(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
@ -46,6 +56,16 @@
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
/*
 * MULSTEP_fromzero(j,h0,h1,h2,h3,h4): multiply step that assigns rather than
 * accumulates, written with plain operators instead of inline asm.
 *
 * Reads g[j] into gj, sets h0 .. h4 to gj*f0 .. gj*f4 (previous values are
 * discarded), writes h0 to h[i+j] with an unaligned store right after it is
 * computed, and finally overwrites h0 with gj*f5.
 *
 * The names gj, g, h, i and f0 .. f5 are not parameters; they must already be
 * in scope where the macro is expanded.
 * NOTE(review): the `*` on the operands presumably relies on GCC/Clang
 * vector-type arithmetic -- confirm against the declarations at the call site.
 */
#define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
h0 = gj*f0; \
_mm256_storeu_ps((float*)&h[i+j], h0); \
h1 = gj*f1; \
h2 = gj*f2; \
h3 = gj*f3; \
h4 = gj*f4; \
h0 = gj*f5;
#define MULSTEP_fromzero_asm(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vmulps %5,%6,%0 \n\t" \

@ -21,6 +21,11 @@
#define broadcast(r) _mm256_set1_pd(r)
#define floor(x) _mm256_floor_pd(x)
/*
 * Fallback _mm_extract_epi64 used only when __amd64__ is not defined.
 * Casts X to __v2di and yields element N of it by subscripting the vector
 * inside a GNU statement expression (hence the __extension__ marker).
 * NOTE(review): presumably needed because the intrinsic is not provided on
 * 32-bit x86 targets -- confirm against the compiler's headers.
 */
#ifndef __amd64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
__a[N];}))
#endif
void rq_encode(unsigned char *c,const modq *f)
{
crypto_int32 f0, f1, f2, f3, f4;

@ -1,5 +1,6 @@
#if __AVX2__
#include <immintrin.h>
#include <smmintrin.h>
#include "mod3.h"
#include "rq.h"
@ -9,6 +10,11 @@
#define v4591_16 _mm256_set1_epi16(4591)
#define v10923_16 _mm256_set1_epi16(10923)
/*
 * Fallback _mm_extract_epi64 used only when __amd64__ is not defined.
 * Casts X to __v2di and yields element N of it by subscripting the vector
 * inside a GNU statement expression (hence the __extension__ marker).
 * NOTE(review): presumably needed because the intrinsic is not provided on
 * 32-bit x86 targets -- confirm against the compiler's headers.
 */
#ifndef __amd64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
__a[N];}))
#endif
static inline __m256i squeeze(__m256i x)
{
__m256i q = _mm256_mulhrs_epi16(x,v7);

@ -5,13 +5,13 @@
#include <llarp/net.hpp>
#ifndef _WIN32
// unix, linux
#include <sys/types.h> // FreeBSD needs this for uchar for ip.h
#include <netinet/in.h>
#include <netinet/ip.h>
#else
// windows nt
#include <winsock2.h>
// Apparently this does not seem to be located _anywhere_ in the windows sdk???
// -despair86
typedef struct ip_hdr
{
unsigned char
@ -46,16 +46,7 @@ typedef struct ip_hdr
#include <memory>
#if !defined(__linux__) && !defined(_WIN32) && !defined(__APPLE__) \
&& !defined(__FreeBSD__)
#define iphdr ip
#define saddr ip_src.s_addr
#define daddr ip_dst.s_addr
#define ip_version ip_v
#define check ip_sum
#define ihl ip_hl
#endif
// anything not win32
struct ip_header
{
#if __BYTE_ORDER == __LITTLE_ENDIAN

@ -34,7 +34,9 @@
#include <climits>
#include <dirent.h>
#ifdef _WIN32
#include <io.h>
#endif
#include "direntry.h"
#include "path.h"

@ -135,7 +135,6 @@ tuntap_sys_start(struct device *dev, int mode, int tun) {
"Can't get link-layer address");
return fd;
}
(void)memcpy(dev->hwaddr, &addr, ETHER_ADDR_LEN);
}
return fd;
}

Loading…
Cancel
Save