import ntru prime code

Jeff Becker
parent d7c1c3322f
commit a60c4b0bef

@@ -0,0 +1,4 @@
#define CRYPTO_SECRETKEYBYTES 1238
#define CRYPTO_PUBLICKEYBYTES 1047
#define CRYPTO_CIPHERTEXTBYTES 1175
#define CRYPTO_BYTES 32

@@ -0,0 +1,57 @@
#ifdef KAT
#include <stdio.h>
#endif
#include "params.h"
#include "small.h"
#include "rq.h"
#include "hide.h"
#include "crypto_kem.h"
static int verify(const unsigned char *x,const unsigned char *y)
{
unsigned int differentbits = 0;
int i;
for (i = 0;i < crypto_kem_CIPHERTEXTBYTES;++i)
differentbits |= x[i] ^ y[i];
return (1 & ((differentbits - 1) >> 8)) - 1;
}
int crypto_kem_dec(
unsigned char *k,
const unsigned char *cstr,
const unsigned char *sk
)
{
modq buf[768];
#define B buf
#define aB buf
small a[768];
unsigned char r[32];
unsigned char checkcstr[crypto_kem_CIPHERTEXTBYTES];
unsigned char maybek[32];
int i;
int result;
small_decode(a,sk); sk += small_encode_len;
rq_decoderounded(B,cstr + 32);
rq_mult(aB,B,a);
rq_rightsubbit(r,cstr + 32 + rq_encoderounded_len,aB);
#ifdef KAT
{
int j;
printf("decrypt r: ");
for (j = 0;j < 32;++j)
printf("%02x",255 & (int) r[j]);
printf("\n");
}
#endif
hide(checkcstr,maybek,sk,r);
result = verify(cstr,checkcstr);
for (i = 0;i < 32;++i) k[i] = maybek[i] & ~result;
return result;
}

@@ -0,0 +1,30 @@
#ifdef KAT
#include <stdio.h>
#endif
#include "hide.h"
#include "randombytes.h"
#include "crypto_kem.h"
int crypto_kem_enc(
unsigned char *cstr,
unsigned char *k,
const unsigned char *pk
)
{
unsigned char r[32];
randombytes(r,32);
#ifdef KAT
{
int i;
printf("encrypt r: ");
for (i = 0;i < 32;++i)
printf("%02x",255 & (int) r[i]);
printf("\n");
}
#endif
hide(cstr,k,pk,r);
return 0;
}

@@ -0,0 +1,40 @@
#include <string.h>
#include "crypto_hash_sha512.h"
#include "crypto_kem.h"
#include "params.h"
#include "rq.h"
#include "hide.h"
#if crypto_kem_CIPHERTEXTBYTES != rq_encoderounded_len + 32 + 128
#error "crypto_kem_CIPHERTEXTBYTES must match rq_encoderounded_len + 32 + 128"
#endif
void hide(unsigned char *cstr,unsigned char *k,const unsigned char *pk,const unsigned char *r)
{
modq buf[768];
#define G buf
#define A buf
#define B buf
#define C buf
unsigned char k12[64];
unsigned char k34[64];
small b[768];
crypto_hash_sha512(k12,r,32);
small_seeded_weightw(b,k12);
crypto_hash_sha512(k34,k12 + 32,32);
memcpy(cstr,k34,32); cstr += 32;
memcpy(k,k34 + 32,32);
rq_fromseed(G,pk);
rq_mult(B,G,b);
/* XXX: cache transform of b for next mult */
/* XXX: cache transform of G inside sk */
/* XXX: cache transform of G when pk is otherwise reused */
rq_roundencode(cstr,B); cstr += rq_encoderounded_len;
rq_decoderounded(A,pk + 32);
rq_mult(C,A,b);
rq_top(cstr,C,r);
}

@@ -0,0 +1,9 @@
#ifndef hide_h
#define hide_h
#include "crypto_int32.h"
#define hide crypto_kem_ntrulpr4591761_avx_hide
extern void hide(unsigned char *,unsigned char *,const unsigned char *,const unsigned char *);
#endif

@@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

@@ -0,0 +1,425 @@
#include "int32_sort.h"
#include <immintrin.h>
typedef crypto_int32 int32;
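/* compare-exchange of *x and *y via cmovg: afterwards *x <= *y,
   with no data-dependent branch */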
static inline void minmax(int32 *x,int32 *y)
{
asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg %%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
: : "r"(x),"r"(y) : "%eax","%ebx","%edx");
}
/* sort x0,x2; sort x1,x3; ... sort x13, x15 */
static inline void minmax02through1315(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi64(a,b); /* a01b01a45b45 */
__m256i d = _mm256_unpackhi_epi64(a,b); /* a23b23a67b67 */
__m256i g = _mm256_min_epi32(c,d);
__m256i h = _mm256_max_epi32(c,d);
a = _mm256_unpacklo_epi64(g,h);
b = _mm256_unpackhi_epi64(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
}
/* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */
static inline void minmax02134657(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0x4e);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0x44);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
}
static void multiminmax2plus2(
int32 *x,
int n)
{
while (n >= 16) {
minmax02through1315(x);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax02134657(x);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 2);
minmax(x + 1,x + 3);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 2);
if (n > 1) minmax(x + 1,x + 3);
}
}
static void multiminmax2plus6(
int32 *x,
int n)
{
while (n >= 4) {
minmax(x,x + 6);
minmax(x + 1,x + 7);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 6);
if (n > 1) minmax(x + 1,x + 7);
}
}
static void multiminmax2plus14(
int32 *x,
int n)
{
while (n >= 8) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
minmax(x + 4,x + 18);
minmax(x + 5,x + 19);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 14);
if (n > 1) minmax(x + 1,x + 15);
}
}
/* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax0145891213(int32 *x,int32 *y)
{
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a0189451213 = _mm256_unpacklo_epi64(a01234567,a89101112131415);
__m256i b0189451213 = _mm256_unpacklo_epi64(b01234567,b89101112131415);
__m256i c0189451213 = _mm256_min_epi32(a0189451213,b0189451213);
__m256i d0189451213 = _mm256_max_epi32(a0189451213,b0189451213);
__m256i c01234567 = _mm256_blend_epi32(a01234567,c0189451213,0x33);
__m256i d01234567 = _mm256_blend_epi32(b01234567,d0189451213,0x33);
__m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213,a89101112131415);
__m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213,b89101112131415);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
}
/* offset >= 30 */
static void multiminmax2plusmore(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax0145891213(x,x + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 5,x + 5 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + offset);
if (n > 1) minmax(x + 1,x + 1 + offset);
}
}
/* sort x0,x1; ... sort x14, x15 */
static inline void minmax01through1415(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi32(a,b); /* ab0ab1ab4ab5 */
__m256i d = _mm256_unpackhi_epi32(a,b); /* ab2ab3ab6ab7 */
__m256i e = _mm256_unpacklo_epi32(c,d); /* a02b02a46b46 */
__m256i f = _mm256_unpackhi_epi32(c,d); /* a13b13a57b57 */
__m256i g = _mm256_min_epi32(e,f); /* a02b02a46b46 */
__m256i h = _mm256_max_epi32(e,f); /* a13b13a57b57 */
a = _mm256_unpacklo_epi32(g,h);
b = _mm256_unpackhi_epi32(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
}
/* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */
static inline void minmax01234567(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0xb1);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0xa0);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
}
static void multiminmax1plus1(
int32 *x,
int n)
{
while (n >= 16) {
minmax01through1415(x);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax01234567(x);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 1);
minmax(x + 2,x + 3);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + 1);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + 1);
}
static void multiminmax1(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
minmax(x + 8,x + 8 + offset);
minmax(x + 10,x + 10 + offset);
minmax(x + 12,x + 12 + offset);
minmax(x + 14,x + 14 + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + offset);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + offset);
}
/* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax02468101214(int32 *x,int32 *y)
{
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a0819412513 = _mm256_unpacklo_epi32(a01234567,a89101112131415);
__m256i a210311614715 = _mm256_unpackhi_epi32(a01234567,a89101112131415);
__m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513,a210311614715);
__m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513,a210311614715);
__m256i b0819412513 = _mm256_unpacklo_epi32(b01234567,b89101112131415);
__m256i b210311614715 = _mm256_unpackhi_epi32(b01234567,b89101112131415);
__m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513,b210311614715);
__m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513,b210311614715);
__m256i c02810461214 = _mm256_min_epi32(a02810461214,b02810461214);
__m256i d02810461214 = _mm256_max_epi32(a02810461214,b02810461214);
__m256i c01234567 = _mm256_unpacklo_epi32(c02810461214,a13911571315);
__m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214,a13911571315);
__m256i d01234567 = _mm256_unpacklo_epi32(d02810461214,b13911571315);
__m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214,b13911571315);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
}
/* assumes offset >= 31 */
static void multiminmax1plusmore(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax02468101214(x,x + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + offset);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + offset);
}
/* sort x0,y0; sort x1,y1; ...; sort x7,y7 */
static inline void minmax8(int32 *x,int32 *y)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) y);
_mm256_storeu_si256((__m256i *) x,_mm256_min_epi32(a,b));
_mm256_storeu_si256((__m256i *) y,_mm256_max_epi32(a,b));
}
/* assumes p >= 8; implies offset >= 8 */
static void multiminmax_atleast8(int p,
int32 *x,
int n,
int offset)
{
int i;
while (n >= 2 * p) {
for (i = 0;i < p;i += 8)
minmax8(x + i,x + i + offset);
n -= 2 * p;
x += 2 * p;
}
for (i = 0;i + 8 <= n;i += 8) {
if (i & p) return;
minmax8(x + i,x + i + offset);
}
for (;i < n;++i) {
if (i & p) return;
minmax(x + i,x + i + offset);
}
}
/* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */
static inline void minmax4(int32 *x,int32 *y)
{
__m128i a = _mm_loadu_si128((__m128i *) x);
__m128i b = _mm_loadu_si128((__m128i *) y);
_mm_storeu_si128((__m128i *) x,_mm_min_epi32(a,b));
_mm_storeu_si128((__m128i *) y,_mm_max_epi32(a,b));
}
static void multiminmax4(
int32 *x,
int n,
int offset)
{
int i;
while (n >= 8) {
minmax4(x,x + offset);
n -= 8;
x += 8;
}
if (n >= 4)
minmax4(x,x + offset);
else
for (i = 0;i < n;++i)
minmax(x + i,x + i + offset);
}
void int32_sort(int32 *x,int n)
{
int top,p,q;
if (n < 2) return;
top = 1;
while (top < n - top) top += top;
for (p = top;p >= 8;p >>= 1) {
multiminmax_atleast8(p,x,n - p,p);
for (q = top;q > p;q >>= 1)
multiminmax_atleast8(p,x + p,n - q,q - p);
}
if (p >= 4) {
multiminmax4(x,n - 4,4);
for (q = top;q > 4;q >>= 1)
multiminmax4(x + 4,n - q,q - 4);
}
if (p >= 2) {
multiminmax2plus2(x,n - 2);
for (q = top;q >= 32;q >>= 1)
multiminmax2plusmore(x + 2,n - q,q - 2);
if (q >= 16)
multiminmax2plus14(x + 2,n - 16);
if (q >= 8)
multiminmax2plus6(x + 2,n - 8);
if (q >= 4)
multiminmax2plus2(x + 2,n - 4);
}
multiminmax1plus1(x,n - 1);
for (q = top;q >= 32;q >>= 1)
multiminmax1plusmore(x + 1,n - q,q - 1);
if (q >= 16)
multiminmax1(x + 1,n - 16,15);
if (q >= 8)
multiminmax1(x + 1,n - 8,7);
if (q >= 4)
multiminmax1(x + 1,n - 4,3);
if (q >= 2)
multiminmax1plus1(x + 1,n - 2);
}

@@ -0,0 +1,9 @@
#ifndef int32_sort_h
#define int32_sort_h
#include "crypto_int32.h"
#define int32_sort crypto_kem_ntrulpr4591761_avx_int32_sort
extern void int32_sort(crypto_int32 *,int);
#endif

@@ -0,0 +1,37 @@
#include <string.h>
#include "modq.h"
#include "params.h"
#include "small.h"
#include "rq.h"
#include "crypto_kem.h"
#include "randombytes.h"
#include "crypto_stream_aes256ctr.h"
#if crypto_kem_PUBLICKEYBYTES != rq_encoderounded_len + 32
#error "crypto_kem_PUBLICKEYBYTES must match rq_encoderounded_len + 32"
#endif
#if crypto_kem_SECRETKEYBYTES != small_encode_len + crypto_kem_PUBLICKEYBYTES
#error "crypto_kem_SECRETKEYBYTES must match small_encode_len + crypto_kem_PUBLICKEYBYTES"
#endif
int crypto_kem_keypair(unsigned char *pk,unsigned char *sk)
{
modq buf[768];
#define G buf
#define A buf
small a[768];
randombytes(pk,32);
rq_fromseed(G,pk);
small_random_weightw(a);
rq_mult(A,G,a);
rq_roundencode(pk + 32,A);
small_encode(sk,a);
memcpy(sk + small_encode_len,pk,crypto_kem_PUBLICKEYBYTES);
return 0;
}

@@ -0,0 +1,36 @@
#ifndef modq_h
#define modq_h
#include "crypto_int16.h"
#include "crypto_int32.h"
#include "crypto_uint16.h"
#include "crypto_uint32.h"
typedef crypto_int16 modq;
/* input between -9000000 and 9000000 */
/* output between -2295 and 2295 */
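/* two multiply-and-shift steps: 228/2^20 and 58470/2^28 both approximate
   1/4591 (the second with rounding via the +2^27 term), so no
   variable-time division by q is needed */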
static inline modq modq_freeze(crypto_int32 a)
{
a -= 4591 * ((228 * a) >> 20);
a -= 4591 * ((58470 * a + 134217728) >> 28);
return a;
}
/* input between 0 and 4294967295 */
/* output = (input % 4591) - 2295 */
static inline modq modq_fromuint32(crypto_uint32 a)
{
crypto_int32 r;
r = (a & 524287) + (a >> 19) * 914; /* <= 8010861 */
return modq_freeze(r - 2295);
}
static inline modq modq_sum(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A + B);
}
#endif
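
The range claim above can be checked exhaustively; a minimal standalone sketch (not part of the imported sources, written against plain stdint types):

#include <stdio.h>
#include <stdint.h>

/* same arithmetic as modq_freeze; like the package, this relies on
   arithmetic right shift of negative values */
static int32_t freeze(int32_t a)
{
  a -= 4591 * ((228 * a) >> 20);
  a -= 4591 * ((58470 * a + 134217728) >> 28);
  return a;
}

int main(void)
{
  int32_t a;
  for (a = -9000000;a <= 9000000;++a) {
    int32_t r = freeze(a);
    /* output must be congruent to the input mod 4591 and lie in [-2295,2295] */
    if (r < -2295 || r > 2295 || (a - r) % 4591 != 0) {
      printf("counterexample %ld -> %ld\n",(long) a,(long) r);
      return 1;
    }
  }
  return 0;
}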

@@ -0,0 +1,738 @@
#include <string.h>
#include <immintrin.h>
#include "rq.h"
#define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
h0 += f0 * gj; \
_mm256_storeu_ps(&h[i + j],h0); \
h1 += f1 * gj; \
h2 += f2 * gj; \
h3 += f3 * gj; \
h4 += f4 * gj; \
h0 = _mm256_loadu_ps(&h[i + j + 5]); \
h0 += f5 * gj;
#define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vmovups %13,%0 \n\t" \
"vfmadd231ps %5,%7,%1 \n\t" \
"vfmadd231ps %5,%8,%2 \n\t" \
"vfmadd231ps %5,%9,%3 \n\t" \
"vfmadd231ps %5,%10,%4 \n\t" \
"vfmadd231ps %5,%11,%0 \n\t" \
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5]));
#define MULSTEP MULSTEP_asm
#define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vfmadd231ps %5,%7,%1 \n\t" \
"vfmadd231ps %5,%8,%2 \n\t" \
"vfmadd231ps %5,%9,%3 \n\t" \
"vfmadd231ps %5,%10,%4 \n\t" \
"vmulps %5,%11,%0 \n\t" \
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
#define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vmulps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vmulps %5,%7,%1 \n\t" \
"vmulps %5,%8,%2 \n\t" \
"vmulps %5,%9,%3 \n\t" \
"vmulps %5,%10,%4 \n\t" \
"vmulps %5,%11,%0 \n\t" \
: "=&x"(h0),"=&x"(h1),"=&x"(h2),"=&x"(h3),"=&x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
static inline __m128i _mm_load_cvtepi8_epi16(const long long *x)
{
__m128i result;
__asm__("vpmovsxbw %1, %0" : "=x"(result) : "m"(*x));
return result;
}
#define v0 _mm256_set1_epi32(0)
#define v0_128 _mm_set1_epi32(0)
#define v7 _mm256_set1_epi16(7)
#define v4591_16 _mm256_set1_epi16(4591)
#define v2296_16 _mm256_set1_epi16(2296)
#define alpha_32 _mm256_set1_epi32(0x4b400000)
#define alpha_32_128 _mm_set1_epi32(0x4b400000)
#define alpha_float _mm256_set1_ps(12582912.0)
#define v0_float _mm256_set1_ps(0)
#define v1_float _mm256_set1_ps(1)
#define vm1_float _mm256_set1_ps(-1)
#define vm4591_float _mm256_set1_ps(-4591)
#define recip4591_float _mm256_set1_ps(0.00021781746896101067305597908952297974298)
static inline __m256 add(__m256 x,__m256 y)
{
return x + y;
}
static inline __m256 fastadd(__m256 x,__m256 y)
{
return _mm256_fmadd_ps(y,v1_float,x);
}
static inline __m256 fastsub(__m256 x,__m256 y)
{
return _mm256_fmadd_ps(y,vm1_float,x);
}
static inline __m256 reduce(__m256 x)
{
__m256 q = x * recip4591_float;
q = _mm256_round_ps(q,8);
return _mm256_fmadd_ps(q,vm4591_float,x);
}
static inline __m256i squeeze(__m256i x)
{
__m256i q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
}
static inline __m256i squeezeadd16(__m256i x,__m256i y)
{
__m256i q;
x = _mm256_add_epi16(x,y);
q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
}
static inline __m256i freeze(__m256i x)
{
__m256i mask, x2296, x4591;
x4591 = _mm256_add_epi16(x,v4591_16);
mask = _mm256_srai_epi16(x,15);
x = _mm256_blendv_epi8(x,x4591,mask);
x2296 = _mm256_sub_epi16(x,v2296_16);
mask = _mm256_srai_epi16(x2296,15);
x4591 = _mm256_sub_epi16(x,v4591_16);
x = _mm256_blendv_epi8(x4591,x,mask);
return x;
}
/* 24*8*float32 f inputs between -10000 and 10000 */
/* 24*8*float32 g inputs between -32 and 32 */
/* 48*8*float32 h outputs between -7680000 and 7680000 */
static void mult24x8_float(__m256 h[48],const __m256 f[24],const __m256 g[24])
{
int i, j;
__m256 f0, f1, f2, f3, f4, f5, gj, h0, h1, h2, h3, h4;
i = 0;
f0 = f[i];
f1 = f[i + 1];
f2 = f[i + 2];
f3 = f[i + 3];
f4 = f[i + 4];
f5 = f[i + 5];
MULSTEP_fromzero(0,h0,h1,h2,h3,h4)
for (j = 0;j < 20;j += 5) {
MULSTEP_noload(j + 1,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
MULSTEP_noload(j + 4,h4,h0,h1,h2,h3)
MULSTEP_noload(j + 5,h0,h1,h2,h3,h4)
}
MULSTEP_noload(j + 1,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
h[i + j + 4] = h4;
h[i + j + 5] = h0;
h[i + j + 6] = h1;
h[i + j + 7] = h2;
h[i + j + 8] = h3;
for (i = 6;i < 24;i += 6) {
f0 = f[i];
f1 = f[i + 1];
f2 = f[i + 2];
f3 = f[i + 3];
f4 = f[i + 4];
f5 = f[i + 5];
h0 = h[i];
h1 = h[i + 1];
h2 = h[i + 2];
h3 = h[i + 3];
h4 = h[i + 4];
for (j = 0;j < 15;j += 5) {
MULSTEP(j + 0,h0,h1,h2,h3,h4)
MULSTEP(j + 1,h1,h2,h3,h4,h0)
MULSTEP(j + 2,h2,h3,h4,h0,h1)
MULSTEP(j + 3,h3,h4,h0,h1,h2)
MULSTEP(j + 4,h4,h0,h1,h2,h3)
}
MULSTEP(j + 0,h0,h1,h2,h3,h4)
MULSTEP(j + 1,h1,h2,h3,h4,h0)
MULSTEP(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
MULSTEP_noload(j + 4,h4,h0,h1,h2,h3)
MULSTEP_noload(j + 5,h0,h1,h2,h3,h4)
MULSTEP_noload(j + 6,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 7,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 8,h3,h4,h0,h1,h2)
h[i + j + 9] = h4;
h[i + j + 10] = h0;
h[i + j + 11] = h1;
h[i + j + 12] = h2;
h[i + j + 13] = h3;
}
h[47] = v0_float;
}
/* 48*8*float32 f inputs between -5000 and 5000 */
/* 48*8*float32 g inputs between -16 and 16 */
/* 96*8*float32 h outputs between -3840000 and 3840000 */
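/* one level of Karatsuba: mult24x8_float computes flo*glo, fhi*ghi and
   (flo+fhi)*(glo+ghi); the loop below recombines them into the product */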
static void mult48x8_float(__m256 h[96],const __m256 f[48],const __m256 g[48])
{
__m256 h01[48];
__m256 g01[24];
__m256 *f01 = h01 + 24;
int i;
for (i = 24;i > 0;) {
i -= 2;
f01[i] = f[i] + f[i + 24];
g01[i] = g[i] + g[i + 24];
f01[i + 1] = f[i + 1] + f[i + 1 + 24];
g01[i + 1] = g[i + 1] + g[i + 1 + 24];
}
mult24x8_float(h,f,g);
mult24x8_float(h + 48,f + 24,g + 24);
mult24x8_float(h01,f01,g01);
for (i = 0;i < 24;++i) {
__m256 h0i = h[i];
__m256 h0itop = h[i + 24];
__m256 h1i = h[i + 48];
__m256 h1itop = h[i + 72];
__m256 h01i = h01[i];
__m256 h01itop = h01[i + 24];
__m256 c = fastsub(h0itop,h1i);
h[i + 24] = c + fastsub(h01i,h0i);
h[i + 48] = fastsub(h01itop,h1itop) - c;
}
}
/* 96*8*float32 f inputs between -2500 and 2500 */
/* 96*8*float32 g inputs between -8 and 8 */
/* 192*8*float32 h outputs between -1920000 and 1920000 */
static void mult96x8_float(__m256 h[192],const __m256 f[96],const __m256 g[96])
{
__m256 h01[96];
__m256 g01[48];
__m256 *f01 = h01 + 48;
int i;
for (i = 48;i > 0;) {
i -= 4;
f01[i] = f[i] + f[i + 48];
g01[i] = g[i] + g[i + 48];
f01[i + 1] = f[i + 1] + f[i + 1 + 48];
g01[i + 1] = g[i + 1] + g[i + 1 + 48];
f01[i + 2] = f[i + 2] + f[i + 2 + 48];
g01[i + 2] = g[i + 2] + g[i + 2 + 48];
f01[i + 3] = f[i + 3] + f[i + 3 + 48];
g01[i + 3] = g[i + 3] + g[i + 3 + 48];
}
mult48x8_float(h,f,g);
mult48x8_float(h + 96,f + 48,g + 48);
mult48x8_float(h01,f01,g01);
for (i = 0;i < 48;++i) {
__m256 h0i = h[i];
__m256 h0itop = h[i + 48];
__m256 h1i = h[i + 96];
__m256 h1itop = h[i + 144];
__m256 h01i = h01[i];
__m256 h01itop = h01[i + 48];
__m256 c = fastsub(h0itop,h1i);
h[i + 48] = c + fastsub(h01i,h0i);
h[i + 96] = fastsub(h01itop,h1itop) - c;
}
}
/* 96*16*int16 f inputs between -2500 and 2500 */
/* 96*(16*int8 stored in 32*int8) g inputs between -8 and 8 */
/* 192*16*int16 h outputs between -2400 and 2400 */
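/* int32 -> float conversion trick: for |n| < 2^22, the int32 bit pattern
   alpha_32 + n reinterpreted as float equals alpha_float + n exactly,
   so subtracting alpha_float yields n without a cvtdq2ps in this path */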
static void mult96x16(__m256i h[192],const __m256i f[96],const __m256i g[96])
{
__m256 hfloat[192];
__m256 gfloat[96];
__m256 *ffloat = hfloat + 96;
int i, p;
for (p = 0;p < 2;++p) {
for (i = 96;i > 0;) {
i -= 2;
__m256i fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i]));
__m256i gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i]));
__m256 storage;
*(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32);
ffloat[i] = storage - alpha_float;
*(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32);
gfloat[i] = storage - alpha_float;
fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i + 1]));
gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i + 1]));
*(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32);
ffloat[i + 1] = storage - alpha_float;
*(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32);
gfloat[i + 1] = storage - alpha_float;
}
mult96x8_float(hfloat,ffloat,gfloat);
for (i = 192;i > 0;) {
__m128i h0, h1;
i -= 4;
hfloat[i] = add(alpha_float,reduce(hfloat[i]));
hfloat[i + 1] = fastadd(alpha_float,reduce(hfloat[i + 1]));
hfloat[i + 2] = add(alpha_float,reduce(hfloat[i + 2]));
hfloat[i + 3] = fastadd(alpha_float,reduce(hfloat[i + 3]));
h0 = 0[(__m128i *) &hfloat[i]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 1]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 1]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 1],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 2]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 2]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 2],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 3]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 3]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 3],_mm_packs_epi32(h0,h1));
}
}
}
/* int16 i of output x[j] is int16 j of input x[i] */
static void transpose16(__m256i x[16])
{
const static int rev[4] = {0,4,2,6};
int i;
__m256i y[16];
for (i = 0;i < 16;i += 4) {
__m256i a0 = x[i];
__m256i a1 = x[i + 1];
__m256i a2 = x[i + 2];
__m256i a3 = x[i + 3];
__m256i b0 = _mm256_unpacklo_epi16(a0,a1);
__m256i b1 = _mm256_unpackhi_epi16(a0,a1);
__m256i b2 = _mm256_unpacklo_epi16(a2,a3);
__m256i b3 = _mm256_unpackhi_epi16(a2,a3);
__m256i c0 = _mm256_unpacklo_epi32(b0,b2);
__m256i c2 = _mm256_unpackhi_epi32(b0,b2);
__m256i c1 = _mm256_unpacklo_epi32(b1,b3);
__m256i c3 = _mm256_unpackhi_epi32(b1,b3);
y[i] = c0;
y[i + 2] = c2;
y[i + 1] = c1;
y[i + 3] = c3;
}
for (i = 0;i < 4;++i) {
int r = rev[i];
__m256i c0 = y[i];
__m256i c4 = y[i + 4];
__m256i c8 = y[i + 8];
__m256i c12 = y[i + 12];
__m256i d0 = _mm256_unpacklo_epi64(c0,c4);
__m256i d4 = _mm256_unpackhi_epi64(c0,c4);
__m256i d8 = _mm256_unpacklo_epi64(c8,c12);
__m256i d12 = _mm256_unpackhi_epi64(c8,c12);
__m256i e0 = _mm256_permute2x128_si256(d0,d8,0x20);
__m256i e8 = _mm256_permute2x128_si256(d0,d8,0x31);
__m256i e4 = _mm256_permute2x128_si256(d4,d12,0x20);
__m256i e12 = _mm256_permute2x128_si256(d4,d12,0x31);
x[r] = e0;
x[r + 8] = e8;
x[r + 1] = e4;
x[r + 9] = e12;
}
}
/* byte i of output x[j] is byte j of input x[i] */
static void transpose32(__m256i x[32])
{
const static int rev[4] = {0,8,4,12};
int i;
__m256i y[32];
for (i = 0;i < 32;i += 4) {
__m256i a0 = x[i];
__m256i a1 = x[i + 1];
__m256i a2 = x[i + 2];
__m256i a3 = x[i + 3];
__m256i b0 = _mm256_unpacklo_epi8(a0,a1);
__m256i b1 = _mm256_unpackhi_epi8(a0,a1);
__m256i b2 = _mm256_unpacklo_epi8(a2,a3);
__m256i b3 = _mm256_unpackhi_epi8(a2,a3);
__m256i c0 = _mm256_unpacklo_epi16(b0,b2);
__m256i c2 = _mm256_unpackhi_epi16(b0,b2);
__m256i c1 = _mm256_unpacklo_epi16(b1,b3);
__m256i c3 = _mm256_unpackhi_epi16(b1,b3);
y[i] = c0;
y[i + 2] = c2;
y[i + 1] = c1;
y[i + 3] = c3;
}
for (i = 0;i < 4;++i) {
int r = rev[i];
__m256i c0 = y[i];
__m256i c8 = y[i + 8];
__m256i c16 = y[i + 16];
__m256i c24 = y[i + 24];
__m256i c4 = y[i + 4];
__m256i c12 = y[i + 12];
__m256i c20 = y[i + 20];
__m256i c28 = y[i + 28];
__m256i d0 = _mm256_unpacklo_epi32(c0,c4);
__m256i d4 = _mm256_unpackhi_epi32(c0,c4);
__m256i d8 = _mm256_unpacklo_epi32(c8,c12);
__m256i d12 = _mm256_unpackhi_epi32(c8,c12);
__m256i d16 = _mm256_unpacklo_epi32(c16,c20);
__m256i d20 = _mm256_unpackhi_epi32(c16,c20);
__m256i d24 = _mm256_unpacklo_epi32(c24,c28);
__m256i d28 = _mm256_unpackhi_epi32(c24,c28);
__m256i e0 = _mm256_unpacklo_epi64(d0,d8);
__m256i e8 = _mm256_unpackhi_epi64(d0,d8);
__m256i e16 = _mm256_unpacklo_epi64(d16,d24);
__m256i e24 = _mm256_unpackhi_epi64(d16,d24);
__m256i e4 = _mm256_unpacklo_epi64(d4,d12);
__m256i e12 = _mm256_unpackhi_epi64(d4,d12);
__m256i e20 = _mm256_unpacklo_epi64(d20,d28);
__m256i e28 = _mm256_unpackhi_epi64(d20,d28);
__m256i f0 = _mm256_permute2x128_si256(e0,e16,0x20);
__m256i f16 = _mm256_permute2x128_si256(e0,e16,0x31);
__m256i f8 = _mm256_permute2x128_si256(e8,e24,0x20);
__m256i f24 = _mm256_permute2x128_si256(e8,e24,0x31);
__m256i f4 = _mm256_permute2x128_si256(e4,e20,0x20);
__m256i f20 = _mm256_permute2x128_si256(e4,e20,0x31);
__m256i f12 = _mm256_permute2x128_si256(e12,e28,0x20);
__m256i f28 = _mm256_permute2x128_si256(e12,e28,0x31);
x[r] = f0;
x[r + 16] = f16;
x[r + 1] = f8;
x[r + 17] = f24;
x[r + 2] = f4;
x[r + 18] = f20;
x[r + 3] = f12;
x[r + 19] = f28;
}
}
/* 48*16*int16 f inputs between -2295 and 2295 */
/* 24*32*int8 g inputs between -1 and 1 */
/* 96*16*int16 h outputs between -2295 and 2295 */
static void mult768_mix2_m256i(__m256i h[96],const __m256i f[48],const __m256i g[24])
{
__m256i hkara[24][16];
__m256i gkara[3][32];
#define fkara hkara
int i;
for (i = 6;i-- > 0;) {
__m256i f0, f1, f2, f3, f4, f5, f6, f7;
__m256i f01, f23, f45, f67;
__m256i f02, f46, f04, f26, f0426;
__m256i f13, f57, f15, f37, f1537;
__m256i f0213, f4657, f04261537, f0415, f2637;
f0 = _mm256_loadu_si256(&f[i + 0]);
f1 = _mm256_loadu_si256(&f[i + 6]);
f2 = _mm256_loadu_si256(&f[i + 12]);
f3 = _mm256_loadu_si256(&f[i + 18]);
f4 = _mm256_loadu_si256(&f[i + 24]);
f5 = _mm256_loadu_si256(&f[i + 30]);
f6 = _mm256_loadu_si256(&f[i + 36]);
f7 = _mm256_loadu_si256(&f[i + 42]);
f01 = squeezeadd16(f0,f1); fkara[i][8] = f01;
f23 = squeezeadd16(f2,f3); fkara[i][9] = f23;
f45 = squeezeadd16(f4,f5); fkara[i][10] = f45;
f67 = squeezeadd16(f6,f7); fkara[i][11] = f67;
fkara[i][0] = f0;
fkara[i][2] = f2;
fkara[i][4] = f4;
fkara[i][6] = f6;
f02 = squeezeadd16(f0,f2); fkara[i + 6][0] = f02;
f04 = squeezeadd16(f0,f4); fkara[i + 6][6] = f04;
f46 = squeezeadd16(f4,f6); fkara[i + 6][3] = f46;
f26 = squeezeadd16(f2,f6); fkara[i + 6][8] = f26;
fkara[i][1] = f1;
fkara[i][3] = f3;
fkara[i][5] = f5;
fkara[i][7] = f7;
f13 = squeezeadd16(f1,f3); fkara[i + 6][1] = f13;
f15 = squeezeadd16(f1,f5); fkara[i + 6][7] = f15;
f57 = squeezeadd16(f5,f7); fkara[i + 6][4] = f57;
f37 = squeezeadd16(f3,f7); fkara[i + 6][9] = f37;
f0426 = squeezeadd16(f04,f26); fkara[i + 6][12] = f0426;
f1537 = squeezeadd16(f15,f37); fkara[i + 6][13] = f1537;
f0213 = squeezeadd16(f02,f13); fkara[i + 6][2] = f0213;
f4657 = squeezeadd16(f46,f57); fkara[i + 6][5] = f4657;
f0415 = squeezeadd16(f04,f15); fkara[i + 6][10] = f0415;
f2637 = squeezeadd16(f26,f37); fkara[i + 6][11] = f2637;
f04261537 = squeezeadd16(f0426,f1537); fkara[i + 6][14] = f04261537;
fkara[i][12] = v0;
fkara[i][13] = v0;
fkara[i][14] = v0;
fkara[i][15] = v0;
fkara[i + 6][15] = v0;
}
for (i = 3;i-- > 0;) {
__m256i g0, g1, g2, g3, g4, g5, g6, g7;
__m256i g01, g23, g45, g67;
__m256i g02, g46, g04, g26, g0426;
__m256i g13, g57, g15, g37, g1537;
__m256i g0213, g4657, g04261537, g0415, g2637;
g0 = _mm256_loadu_si256(&g[i + 0]);
g1 = _mm256_loadu_si256(&g[i + 3]);
g2 = _mm256_loadu_si256(&g[i + 6]);
g3 = _mm256_loadu_si256(&g[i + 9]);
g4 = _mm256_loadu_si256(&g[i + 12]);
g5 = _mm256_loadu_si256(&g[i + 15]);
g6 = _mm256_loadu_si256(&g[i + 18]);
g7 = _mm256_loadu_si256(&g[i + 21]);
g01 = _mm256_add_epi8(g0,g1); gkara[i][8] = g01;
g23 = _mm256_add_epi8(g2,g3); gkara[i][9] = g23;
g45 = _mm256_add_epi8(g4,g5); gkara[i][10] = g45;
g67 = _mm256_add_epi8(g6,g7); gkara[i][11] = g67;
gkara[i][0] = g0;
gkara[i][2] = g2;
gkara[i][4] = g4;
gkara[i][6] = g6;
g02 = _mm256_add_epi8(g0,g2); gkara[i][16] = g02;
g04 = _mm256_add_epi8(g0,g4); gkara[i][22] = g04;
g46 = _mm256_add_epi8(g4,g6); gkara[i][19] = g46;
g26 = _mm256_add_epi8(g2,g6); gkara[i][24] = g26;
gkara[i][1] = g1;
gkara[i][3] = g3;
gkara[i][5] = g5;
gkara[i][7] = g7;
g13 = _mm256_add_epi8(g1,g3); gkara[i][17] = g13;
g15 = _mm256_add_epi8(g1,g5); gkara[i][23] = g15;
g57 = _mm256_add_epi8(g5,g7); gkara[i][20] = g57;
g37 = _mm256_add_epi8(g3,g7); gkara[i][25] = g37;
g0426 = _mm256_add_epi8(g04,g26); gkara[i][28] = g0426;
g1537 = _mm256_add_epi8(g15,g37); gkara[i][29] = g1537;
g0213 = _mm256_add_epi8(g02,g13); gkara[i][18] = g0213;
g4657 = _mm256_add_epi8(g46,g57); gkara[i][21] = g4657;
g0415 = _mm256_add_epi8(g04,g15); gkara[i][26] = g0415;
g2637 = _mm256_add_epi8(g26,g37); gkara[i][27] = g2637;
g04261537 = _mm256_add_epi8(g0426,g1537); gkara[i][30] = g04261537;
gkara[i][12] = v0;
gkara[i][13] = v0;
gkara[i][14] = v0;
gkara[i][15] = v0;
gkara[i][31] = v0;
}
for (i = 12;i-- > 0;)
transpose16(fkara[i]);
for (i = 3;i-- > 0;)
transpose32(gkara[i]);
mult96x16(hkara[12],fkara[6],(__m256i *) (1 + (__m128i *) gkara));
mult96x16(hkara[0],fkara[0],gkara[0]);
for (i = 24;i-- > 0;)
transpose16(hkara[i]);
for (i = 6;i-- > 0;) {
__m256i h0,h1,h2,h3,h4,h5,h6,h7,h8,h9;
__m256i h10,h11,h12,h13,h14,h15,h16,h17,h18,h19;
__m256i h20,h21,h22,h23;
__m256i h32,h33,h34,h35,h36,h37,h38,h39;
__m256i h40,h41,h42,h43,h44,h45,h46,h47,h48,h49;
__m256i h50,h51,h52,h53,h54,h55,h56,h57,h58,h59;
__m256i h60,h61;
__m256i c;
#define COMBINE(h0,h1,h2,h3,x0,x1) \
c = _mm256_sub_epi16(h1,h2); \
h1 = _mm256_sub_epi16(_mm256_add_epi16(c,x0),h0); \
h2 = _mm256_sub_epi16(x1,_mm256_add_epi16(c,h3)); \
h1 = squeeze(h1); \
h2 = squeeze(h2);
h56 = hkara[i + 12][12];
h57 = hkara[i + 18][12];
h58 = hkara[i + 12][13];
h59 = hkara[i + 18][13];
h60 = hkara[i + 12][14];
h61 = hkara[i + 18][14];
COMBINE(h56,h57,h58,h59,h60,h61)
h44 = hkara[i + 12][6];
h45 = hkara[i + 18][6];
h46 = hkara[i + 12][7];
h47 = hkara[i + 18][7];
h52 = hkara[i + 12][10];
h53 = hkara[i + 18][10];
COMBINE(h44,h45,h46,h47,h52,h53)
h48 = hkara[i + 12][8];
h49 = hkara[i + 18][8];
h50 = hkara[i + 12][9];
h51 = hkara[i + 18][9];
h54 = hkara[i + 12][11];
h55 = hkara[i + 18][11];
COMBINE(h48,h49,h50,h51,h54,h55)
COMBINE(h44,h46,h48,h50,h56,h58)
COMBINE(h45,h47,h49,h51,h57,h59)
h0 = hkara[i][0];
h1 = hkara[i + 6][0];
h2 = hkara[i][1];
h3 = hkara[i + 6][1];
h16 = hkara[i][8];
h17 = hkara[i + 6][8];
COMBINE(h0,h1,h2,h3,h16,h17)
h4 = hkara[i][2];
h5 = hkara[i + 6][2];
h6 = hkara[i][3];
h7 = hkara[i + 6][3];
h18 = hkara[i][9];
h19 = hkara[i + 6][9];
COMBINE(h4,h5,h6,h7,h18,h19)
h32 = hkara[i + 12][0];
h33 = hkara[i + 18][0];
h34 = hkara[i + 12][1];
h35 = hkara[i + 18][1];
h36 = hkara[i + 12][2];
h37 = hkara[i + 18][2];
COMBINE(h32,h33,h34,h35,h36,h37)
COMBINE(h1,h3,h5,h7,h33,h35)
COMBINE(h0,h2,h4,h6,h32,h34)
h8 = hkara[i][4];
h9 = hkara[i + 6][4];
h10 = hkara[i][5];
h11 = hkara[i + 6][5];
h20 = hkara[i][10];
h21 = hkara[i + 6][10];
COMBINE(h8,h9,h10,h11,h20,h21)
h12 = hkara[i][6];
h13 = hkara[i + 6][6];
h14 = hkara[i][7];
h15 = hkara[i + 6][7];
h22 = hkara[i][11];
h23 = hkara[i + 6][11];
COMBINE(h12,h13,h14,h15,h22,h23)
h38 = hkara[i + 12][3];
h39 = hkara[i + 18][3];
h40 = hkara[i + 12][4];
h41 = hkara[i + 18][4];
h42 = hkara[i + 12][5];
h43 = hkara[i + 18][5];
COMBINE(h38,h39,h40,h41,h42,h43)
COMBINE(h8,h10,h12,h14,h38,h40)
COMBINE(h9,h11,h13,h15,h39,h41)
COMBINE(h0,h4,h8,h12,h44,h48)
h0 = freeze(h0);
h4 = freeze(h4);
h8 = freeze(h8);
h12 = freeze(h12);
_mm256_storeu_si256(&h[i + 0],h0);
_mm256_storeu_si256(&h[i + 24],h4);
_mm256_storeu_si256(&h[i + 48],h8);
_mm256_storeu_si256(&h[i + 72],h12);
COMBINE(h1,h5,h9,h13,h45,h49)
h1 = freeze(h1);
h5 = freeze(h5);
h9 = freeze(h9);
h13 = freeze(h13);
_mm256_storeu_si256(&h[i + 6],h1);
_mm256_storeu_si256(&h[i + 30],h5);
_mm256_storeu_si256(&h[i + 54],h9);
_mm256_storeu_si256(&h[i + 78],h13);
COMBINE(h2,h6,h10,h14,h46,h50)
h2 = freeze(h2);
h6 = freeze(h6);
h10 = freeze(h10);
h14 = freeze(h14);
_mm256_storeu_si256(&h[i + 12],h2);
_mm256_storeu_si256(&h[i + 36],h6);
_mm256_storeu_si256(&h[i + 60],h10);
_mm256_storeu_si256(&h[i + 84],h14);
COMBINE(h3,h7,h11,h15,h47,h51)
h3 = freeze(h3);
h7 = freeze(h7);
h11 = freeze(h11);
h15 = freeze(h15);
_mm256_storeu_si256(&h[i + 18],h3);
_mm256_storeu_si256(&h[i + 42],h7);
_mm256_storeu_si256(&h[i + 66],h11);
_mm256_storeu_si256(&h[i + 90],h15);
}
}
#define p 761
/* 761 f inputs between -2295 and 2295 */
/* 761 g inputs between -1 and 1 */
/* 761 h outputs between -2295 and 2295 */
void rq_mult(modq *h,const modq *f,const small *g)
{
__m256i fgvec[96];
modq *fg;
int i;
mult768_mix2_m256i(fgvec,(__m256i *) f,(__m256i *) g);
fg = (modq *) fgvec;
h[0] = modq_freeze(fg[0] + fg[p]);
for (i = 1;i < 9;++i)
h[i] = modq_freeze(fg[i] + fg[i + p - 1] + fg[i + p]);
for (i = 9;i < 761;i += 16) {
__m256i fgi = _mm256_loadu_si256((__m256i *) &fg[i]);
__m256i fgip = _mm256_loadu_si256((__m256i *) &fg[i + p]);
__m256i fgip1 = _mm256_loadu_si256((__m256i *) &fg[i + p - 1]);
__m256i x = _mm256_add_epi16(fgi,_mm256_add_epi16(fgip,fgip1));
x = freeze(squeeze(x));
_mm256_storeu_si256((__m256i *) &h[i],x);
}
for (i = 761;i < 768;++i)
h[i] = 0;
}

@@ -0,0 +1,15 @@
#ifndef params_h
#define params_h
#define q 4591
/* XXX: also built into modq in various ways */
#define qshift 2295
#define p 761
#define w 250
#define rq_encode_len 1218
#define rq_encoderounded_len 1015
#define small_encode_len 191
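/* rq_encoderounded_len = 4*(p/3) + 3: four bytes per three rounded
   coefficients, plus three bytes for the final two (p mod 3 = 2) */
/* small_encode_len = p/4 + 1: one byte per four coefficients in {-1,0,1},
   plus one byte for the final coefficient (p mod 4 = 1) */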
#endif

@@ -0,0 +1,29 @@
#include "params.h"
#include "randombytes.h"
#include "int32_sort.h"
#include "small.h"
#include "crypto_stream_aes256ctr.h"
static const unsigned char n[16] = {0};
void small_seeded_weightw(small *f,const unsigned char *k)
{
crypto_int32 r[768];
int i;
crypto_stream_aes256ctr((unsigned char *) r,sizeof r,n,k);
for (i = 0;i < p;++i) r[i] ^= 0x80000000;
for (i = 0;i < w;++i) r[i] &= -2;
for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1;
int32_sort(r,p);
for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1;
for (i = p;i < 768;++i) f[i] = 0;
}
void small_random_weightw(small *f)
{
unsigned char k[32];
randombytes(k,32);
small_seeded_weightw(f,k);
}

@@ -0,0 +1,37 @@
#ifndef rq_h
#define rq_h
#include "modq.h"
#include "small.h"
#define rq_encode crypto_kem_ntrulpr4591761_avx_rq_encode
extern void rq_encode(unsigned char *,const modq *);
#define rq_decode crypto_kem_ntrulpr4591761_avx_rq_decode
extern void rq_decode(modq *,const unsigned char *);
#define rq_roundencode crypto_kem_ntrulpr4591761_avx_rq_roundencode
extern void rq_roundencode(unsigned char *,const modq *);
#define rq_decoderounded crypto_kem_ntrulpr4591761_avx_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
#define rq_round3 crypto_kem_ntrulpr4591761_avx_rq_round
extern void rq_round3(modq *,const modq *);
#define rq_mult crypto_kem_ntrulpr4591761_avx_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
#define rq_recip3 crypto_kem_ntrulpr4591761_avx_rq_recip3
int rq_recip3(modq *,const small *);
#define rq_fromseed crypto_kem_ntrulpr4591761_avx_rq_fromseed
extern void rq_fromseed(modq *,const unsigned char *);
#define rq_top crypto_kem_ntrulpr4591761_avx_rq_top
extern void rq_top(unsigned char *,const modq *,const unsigned char *);
#define rq_rightsubbit crypto_kem_ntrulpr4591761_avx_rq_rightsubbit
extern void rq_rightsubbit(unsigned char *,const unsigned char *,const modq *);
#endif

@@ -0,0 +1,21 @@
#include "crypto_stream_aes256ctr.h"
#include "rq.h"
#include "params.h"
static const unsigned char n[16] = {0};
void rq_fromseed(modq *h,const unsigned char *K)
{
crypto_uint32 buf[768];
int i;
crypto_stream_aes256ctr((unsigned char *) buf,sizeof buf,n,K);
/* will use 761*4 bytes */
/* convenient for aes to generate multiples of 16 bytes */
/* and larger multiples for some implementations */
for (i = 0;i < p;++i)
h[i] = modq_fromuint32(buf[i]);
for (i = p;i < 768;++i)
h[i] = 0;
}

@@ -0,0 +1,21 @@
#include "rq.h"
#include "params.h"
void rq_rightsubbit(unsigned char *r,const unsigned char *c,const modq *ab)
{
modq t[256];
int i;
for (i = 0;i < 128;++i) {
crypto_uint32 x = c[i];
t[2*i] = (x & 15) * 287 - 2007;
t[2*i+1] = (x >> 4) * 287 - 2007;
}
for (i = 0;i < 256;++i)
t[i] = -(modq_freeze(t[i] - ab[i] + 4*w+1) >> 14);
for (i = 0;i < 32;++i) r[i] = 0;
for (i = 0;i < 256;++i)
r[i / 8] |= (t[i] << (i & 7));
}

@@ -0,0 +1,260 @@
#include <immintrin.h>
#include "params.h"
#include "crypto_uint32.h"
#include "rq.h"
#define alpha_top _mm256_set1_epi32(0x43380000)
#define alpha _mm256_set1_pd(6755399441055744.0)
#define v10923_16 _mm256_set1_epi16(10923)
#define floor(x) _mm256_floor_pd(x)
void rq_roundencode(unsigned char *c,const modq *f)
{
int i;
__m256i h[50];
for (i = 0;i < 208;i += 16) {
__m256i a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2;
__m256i e0, e1, f0, f1, g0, g1;
a0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[0]));
a1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[8]));
a2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[16]));
a0 = _mm256_inserti128_si256(a0,_mm_loadu_si128((__m128i *) &f[24]),1);
a1 = _mm256_inserti128_si256(a1,_mm_loadu_si128((__m128i *) &f[32]),1);
a2 = _mm256_inserti128_si256(a2,_mm_loadu_si128((__m128i *) &f[40]),1);
f += 48;
a0 = _mm256_mulhrs_epi16(a0,v10923_16);
a1 = _mm256_mulhrs_epi16(a1,v10923_16);
a2 = _mm256_mulhrs_epi16(a2,v10923_16);
/* a0: a0 a1 a2 b0 b1 b2 c0 c1 and similar second half */
/* a1: c2 d0 d1 d2 e0 e1 e2 f0 */
/* a2: f1 f2 g0 g1 g2 h0 h1 h2 */
b1 = _mm256_blend_epi16(a2,a0,0xf0);
b1 = _mm256_shuffle_epi32(b1,0x4e);
b0 = _mm256_blend_epi16(a0,a1,0xf0);
b2 = _mm256_blend_epi16(a1,a2,0xf0);
/* XXX: use shufps instead? */
/* b0: a0 a1 a2 b0 e0 e1 e2 f0 */
/* b1: b1 b2 c0 c1 f1 f2 g0 g1 */
/* b2: c2 d0 d1 d2 g2 h0 h1 h2 */
c1 = _mm256_blend_epi16(b2,b0,0xcc);
c1 = _mm256_shuffle_epi32(c1,0xb1);
c0 = _mm256_blend_epi16(b0,b1,0xcc);
c2 = _mm256_blend_epi16(b1,b2,0xcc);
/* c0: a0 a1 c0 c1 e0 e1 g0 g1 */
/* c1: a2 b0 c2 d0 e2 f0 g2 h0 */
/* c2: b1 b2 d1 d2 f1 f2 h1 h2 */
d1 = _mm256_blend_epi16(c2,c0,0xaa);
d1 = _mm256_shufflelo_epi16(d1,0xb1);
d1 = _mm256_shufflehi_epi16(d1,0xb1);
d0 = _mm256_blend_epi16(c0,c1,0xaa);
d2 = _mm256_blend_epi16(c1,c2,0xaa);
/* d0: a0 b0 c0 d0 e0 f0 g0 h0 */
/* d1: a1 b1 c1 d1 e1 f1 g1 h1 */
/* d2: a2 b2 c2 d2 e2 f2 g2 h2 */
d0 = _mm256_add_epi16(d0,_mm256_set1_epi16(765));
d1 = _mm256_add_epi16(d1,_mm256_set1_epi16(765));
d2 = _mm256_add_epi16(d2,_mm256_set1_epi16(765));
/* want bytes of d0 + 1536*d1 + 1536*1536*d2 */
e0 = d0 & _mm256_set1_epi16(0xff);
d0 = _mm256_srli_epi16(d0,8);
/* want e0, d0 + 6*d1 + 6*1536*d2 */
d1 = _mm256_mullo_epi16(d1,_mm256_set1_epi16(6));
d0 = _mm256_add_epi16(d0,d1);
/* want e0, d0 + 6*1536*d2 */
e1 = _mm256_slli_epi16(d0,8);
e0 = _mm256_add_epi16(e0,e1);
d0 = _mm256_srli_epi16(d0,8);
/* want e0, d0 + 36*d2 */
d2 = _mm256_mullo_epi16(d2,_mm256_set1_epi16(36));
e1 = _mm256_add_epi16(d0,d2);
/* want e0, e1 */
/* e0: out0 out1 out4 out5 out8 out9 ... */
/* e1: out2 out3 out6 out7 out10 out11 ... */
f0 = _mm256_unpacklo_epi16(e0,e1);
f1 = _mm256_unpackhi_epi16(e0,e1);
g0 = _mm256_permute2x128_si256(f0,f1,0x20);
g1 = _mm256_permute2x128_si256(f0,f1,0x31);
_mm256_storeu_si256((__m256i *) c,g0);
_mm256_storeu_si256((__m256i *) (c + 32),g1);
c += 64;
}
for (i = 0;i < 9;++i) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[16 * i]);
_mm256_storeu_si256(&h[i],_mm256_mulhrs_epi16(x,v10923_16));
}
f = (const modq *) h;
for (i = 208;i < 253;++i) {
crypto_int32 f0, f1, f2;
f0 = *f++;
f1 = *f++;
f2 = *f++;
f0 += 1806037245;
f1 *= 3;
f2 *= 9;
f0 += f1 << 9;
f0 += f2 << 18;
*(crypto_int32 *) c = f0;
c += 4;
}
{
crypto_int32 f0, f1;
f0 = *f++;
f1 = *f++;
f0 += 1175805;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
}
void rq_decoderounded(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1, c2, c3;
crypto_uint32 f0, f1, f2;
int i;
for (i = 0;i < 248;i += 8) {
__m256i abcdefgh, todo[2];
__m256d x, f2, f1, f0;
__m128i if2, if1, if0;
int j;
abcdefgh = _mm256_loadu_si256((__m256i *) c);
c += 32;
todo[0] = _mm256_unpacklo_epi32(abcdefgh,alpha_top);
todo[1] = _mm256_unpackhi_epi32(abcdefgh,alpha_top);
for (j = 0;j < 2;++j) {
x = *(__m256d *) &todo[j];
x -= alpha;
/* x is f0 + f1*1536 + f2*1536^2 */
/* with each f between 0 and 1530 */
f2 = x * _mm256_set1_pd(0.00000042385525173611114052197733521876177320564238470979034900665283203125);
f2 = floor(f2);
x -= f2 * _mm256_set1_pd(2359296.0);
f1 = x * _mm256_set1_pd(0.00065104166666666673894681149903362893383018672466278076171875);
f1 = floor(f1);
x -= f1 * _mm256_set1_pd(1536.0);
f0 = x;
f2 -= _mm256_set1_pd(1531.0) * floor(f2 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f1 -= _mm256_set1_pd(1531.0) * floor(f1 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f0 -= _mm256_set1_pd(1531.0) * floor(f0 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f2 *= _mm256_set1_pd(3.0); f2 -= _mm256_set1_pd(2295.0);
f1 *= _mm256_set1_pd(3.0); f1 -= _mm256_set1_pd(2295.0);
f0 *= _mm256_set1_pd(3.0); f0 -= _mm256_set1_pd(2295.0);
if2 = _mm256_cvtpd_epi32(f2); /* a2 b2 e2 f2 */
if1 = _mm256_cvtpd_epi32(f1); /* a1 b1 e1 f1 */
if0 = _mm256_cvtpd_epi32(f0); /* a0 b0 e0 f0 */
f[6*j + 0] = _mm_extract_epi32(if0,0);
f[6*j + 1] = _mm_extract_epi32(if1,0);
f[6*j + 2] = _mm_extract_epi32(if2,0);
f[6*j + 3] = _mm_extract_epi32(if0,1);
f[6*j + 4] = _mm_extract_epi32(if1,1);
f[6*j + 5] = _mm_extract_epi32(if2,1);
f[6*j + 12] = _mm_extract_epi32(if0,2);
f[6*j + 13] = _mm_extract_epi32(if1,2);
f[6*j + 14] = _mm_extract_epi32(if2,2);
f[6*j + 15] = _mm_extract_epi32(if0,3);
f[6*j + 16] = _mm_extract_epi32(if1,3);
f[6*j + 17] = _mm_extract_epi32(if2,3);
}
f += 24;
}
for (i = 248;i < 253;++i) {
c0 = *c++;
c1 = *c++;
c2 = *c++;
c3 = *c++;
/* f0 + f1*1536 + f2*1536^2 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 */
/* with each f between 0 and 1530 */
/* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */
/* claim: 2^21 f2 < x < 2^21(f2+1) */
/* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */
/* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */
/* at least 456 - (8/9)255 - (2/9)255 > 0 */
/* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */
f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21;
c2 += c3 << 8;
c2 -= (f2 * 9) << 2;
/* f0 + f1*1536 */
/* = c0 + c1*256 + c2*256^2 */
/* c2 <= 35 = floor((1530+1530*1536)/256^2) */
/* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */
/* claim: 2^21 f1 < x < 2^21(f1+1) */
/* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */
/* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */
/* at least 1365 - (1/3)35 - (1/3)255 - (1/3)255 > 0 */
/* at most 1365 + (4096/3)1530 < 2^21 */
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = modq_freeze(f2 * 3 + q - qshift);
}
c0 = *c++;
c1 = *c++;
c2 = *c++;
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
}

@@ -0,0 +1,17 @@
#include "rq.h"
void rq_top(unsigned char *c,const modq *f,const unsigned char *r)
{
modq T[256];
int i;
for (i = 0;i < 256;++i) {
modq x = f[i];
x = modq_sum(x,2295 * (1 & (r[i / 8] >> (i & 7))));
x = ((x + 2156) * 114 + 16384) >> 15;
T[i] = x; /* between 0 and 15 */
}
for (i = 0;i < 128;++i)
*c++ = T[2*i] + (T[2*i + 1] << 4);
}

@@ -0,0 +1,44 @@
#include "params.h"
#include "small.h"
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void small_encode(unsigned char *c,const small *f)
{
small c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *f++ + 1;
c0 += (*f++ + 1) << 2;
c0 += (*f++ + 1) << 4;
c0 += (*f++ + 1) << 6;
*c++ = c0;
}
c0 = *f++ + 1;
*c++ = c0;
}
void small_decode(small *f,const unsigned char *c)
{
unsigned char c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1;
}
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
}

@@ -0,0 +1,27 @@
#ifndef small_h
#define small_h
#include "crypto_int8.h"
#include "crypto_int32.h"
typedef crypto_int8 small;
#define small_encode crypto_kem_ntrulpr4591761_avx_small_encode
extern void small_encode(unsigned char *,const small *);
#define small_decode crypto_kem_ntrulpr4591761_avx_small_decode
extern void small_decode(small *,const unsigned char *);
#define small_random32 crypto_kem_ntrulpr4591761_avx_small_random32
extern crypto_int32 small_random32(void);
#define small_random crypto_kem_ntrulpr4591761_avx_small_random
extern void small_random(small *);
#define small_seeded_weightw crypto_kem_ntrulpr4591761_avx_small_seeded_weightw
extern void small_seeded_weightw(small *,const unsigned char *);
#define small_random_weightw crypto_kem_ntrulpr4591761_avx_small_random_weightw
extern void small_random_weightw(small *);
#endif

@@ -0,0 +1 @@
bcc60c85ac6ca2dbbe244878ba9b62019560516e8377aecd890c737bf5dcb05f

@@ -0,0 +1 @@
a13b63e4929ab2ab97f7889f071245113ddd919bdaf1c883e12cd80fdf4f9e3e

@@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

@@ -0,0 +1,30 @@
This is a reference implementation of NTRU LPRime 4591^761. This
implementation is designed primarily for clarity, subject to the
following constraints:

* The implementation is written in C. We have a separate Sage
implementation that is considerably more concise.

* The implementation avoids data-dependent branches and array
indices. For example, conditional swaps are computed by arithmetic
rather than by branches.

* The implementation avoids other C operations that often take
variable time. For example, divisions by 3 are computed via
multiplications and shifts.

This implementation does _not_ sacrifice clarity for speed.

This implementation has not yet been reviewed for correctness or for
constant-time behavior. It does pass various tests and has no known
bugs, but there are at least some platforms where multiplications take
variable time, and fixing this requires platform-specific effort; see
https://www.bearssl.org/ctmul.html and http://repository.tue.nl/800603.

This implementation allows "benign malleability" of ciphertexts, as
defined in http://www.shoup.net/papers/iso-1_1.pdf. A similar comment
applies to public keys.

There is a separate "avx" implementation where similar comments apply,
except that "avx" _does_ sacrifice clarity for speed on CPUs with AVX2
instructions.
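
To make the two constant-time techniques above concrete, here is a minimal standalone sketch (illustrative only; it restates the arithmetic already used by rq_round3 and by int32_sort's minmax in this package):

#include <assert.h>
#include <stdint.h>

/* round x to the nearest multiple of 3 for -2295 <= x <= 2295:
   21846/2^16 approximates 1/3, so no variable-time division occurs */
static int16_t round3(int16_t x)
{
  return ((21846 * (x + 2295) + 32768) >> 16) * 3 - 2295;
}

/* conditional swap by arithmetic: afterwards *x <= *y, no branches */
static void minmax(int32_t *x,int32_t *y)
{
  uint32_t xi = *x, yi = *y, xy = xi ^ yi;
  uint32_t c = yi - xi;
  c ^= xy & (c ^ yi); /* top bit of c is now set exactly when *x > *y */
  c >>= 31; c = -c;   /* all-ones mask when a swap is needed */
  c &= xy;
  *x = xi ^ c;
  *y = yi ^ c;
}

int main(void)
{
  int32_t a = 7, b = -5;
  int16_t x;
  for (x = -2295;x <= 2295;++x)
    assert(round3(x) % 3 == 0 && round3(x) - x >= -1 && round3(x) - x <= 1);
  minmax(&a,&b);
  assert(a == -5 && b == 7);
  return 0;
}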

@@ -0,0 +1,4 @@
#define CRYPTO_SECRETKEYBYTES 1238
#define CRYPTO_PUBLICKEYBYTES 1047
#define CRYPTO_CIPHERTEXTBYTES 1175
#define CRYPTO_BYTES 32

@@ -0,0 +1,68 @@
#ifdef KAT
#include <stdio.h>
#endif
#include "params.h"
#include "small.h"
#include "rq.h"
#include "hide.h"
#include "crypto_kem.h"
static int verify(const unsigned char *x,const unsigned char *y)
{
unsigned int differentbits = 0;
int i;
for (i = 0;i < crypto_kem_CIPHERTEXTBYTES;++i)
differentbits |= x[i] ^ y[i];
return (1 & ((differentbits - 1) >> 8)) - 1;
}
int crypto_kem_dec(
unsigned char *k,
const unsigned char *cstr,
const unsigned char *sk
)
{
small a[p];
modq B[p];
modq aB[p];
modq C[256];
unsigned char r[32];
unsigned char checkcstr[crypto_kem_CIPHERTEXTBYTES];
unsigned char maybek[32];
int i;
int result;
small_decode(a,sk); sk += small_encode_len;
rq_decoderounded(B,cstr + 32);
rq_mult(aB,B,a);
for (i = 0;i < 128;++i) {
crypto_uint32 x = cstr[32 + rq_encoderounded_len + i];
C[2*i] = (x & 15) * 287 - 2007;
C[2*i+1] = (x >> 4) * 287 - 2007;
}
for (i = 0;i < 256;++i)
C[i] = -(modq_freeze(C[i] - aB[i] + 4*w+1) >> 14);
for (i = 0;i < 32;++i) r[i] = 0;
for (i = 0;i < 256;++i)
r[i / 8] |= (C[i] << (i & 7));
#ifdef KAT
{
int j;
printf("decrypt r: ");
for (j = 0;j < 32;++j)
printf("%02x",255 & (int) r[j]);
printf("\n");
}
#endif
hide(checkcstr,maybek,sk,r);
result = verify(cstr,checkcstr);
for (i = 0;i < 32;++i) k[i] = maybek[i] & ~result;
return result;
}

@@ -0,0 +1,30 @@
#ifdef KAT
#include <stdio.h>
#endif
#include "hide.h"
#include "randombytes.h"
#include "crypto_kem.h"
int crypto_kem_enc(
unsigned char *cstr,
unsigned char *k,
const unsigned char *pk
)
{
unsigned char r[32];
randombytes(r,32);
#ifdef KAT
{
int i;
printf("encrypt r: ");
for (i = 0;i < 32;++i)
printf("%02x",255 & (int) r[i]);
printf("\n");
}
#endif
hide(cstr,k,pk,r);
return 0;
}

@@ -0,0 +1,49 @@
#include <stdio.h>
#include <string.h>
#include "crypto_hash_sha512.h"
#include "crypto_kem.h"
#include "params.h"
#include "rq.h"
#include "hide.h"
#if crypto_kem_CIPHERTEXTBYTES != rq_encoderounded_len + 32 + 128
#error "crypto_kem_CIPHERTEXTBYTES must match rq_encoderounded_len + 32 + 128"
#endif
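/* deterministic core of encapsulation: r is hashed to derive the
   weight-w secret b, a 32-byte ciphertext confirmation and the session
   key k; the ciphertext is (confirmation, Round(G*b), Top(A*b + 2295*r)) */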
void hide(unsigned char *cstr,unsigned char *k,const unsigned char *pk,const unsigned char *r)
{
modq G[p];
modq A[p];
unsigned char k12[64];
unsigned char k34[64];
small b[p];
modq B[p];
modq C[p];
int i;
rq_fromseed(G,pk);
rq_decoderounded(A,pk + 32);
crypto_hash_sha512(k12,r,32);
small_seeded_weightw(b,k12);
crypto_hash_sha512(k34,k12 + 32,32);
rq_mult(B,G,b);
rq_round3(B,B);
rq_mult(C,A,b);
for (i = 0;i < 256;++i) {
modq x = C[i];
x = modq_sum(x,2295 * (1 & (r[i / 8] >> (i & 7))));
x = ((x + 2156) * 114 + 16384) >> 15;
C[i] = x; /* between 0 and 15 */
}
memcpy(cstr,k34,32); cstr += 32;
memcpy(k,k34 + 32,32);
rq_encoderounded(cstr,B); cstr += rq_encoderounded_len;
for (i = 0;i < 128;++i)
*cstr++ = C[2*i] + (C[2*i + 1] << 4);
}

@@ -0,0 +1,9 @@
#ifndef hide_h
#define hide_h
#include "crypto_int32.h"
#define hide crypto_kem_ntrulpr4591761_ref_hide
extern void hide(unsigned char *,unsigned char *,const unsigned char *,const unsigned char *);
#endif

@@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

@@ -0,0 +1,35 @@
#include "int32_sort.h"
#include "crypto_uint32.h"
static void minmax(crypto_int32 *x,crypto_int32 *y)
{
crypto_uint32 xi = *x;
crypto_uint32 yi = *y;
crypto_uint32 xy = xi ^ yi;
crypto_uint32 c = yi - xi;
c ^= xy & (c ^ yi);
c >>= 31;
c = -c;
c &= xy;
*x = xi ^ c;
*y = yi ^ c;
}
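/* in-place sorting network: the sequence of minmax calls depends only on
   n and the indices, never on the data, so the sort runs in constant time
   for fixed n */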
void int32_sort(crypto_int32 *x,int n)
{
int top,p,q,i;
if (n < 2) return;
top = 1;
while (top < n - top) top += top;
for (p = top;p > 0;p >>= 1) {
for (i = 0;i < n - p;++i)
if (!(i & p))
minmax(x + i,x + i + p);
for (q = top;q > p;q >>= 1)
for (i = 0;i < n - q;++i)
if (!(i & p))
minmax(x + i + p,x + i + q);
}
}

@@ -0,0 +1,9 @@
#ifndef int32_sort_h
#define int32_sort_h
#include "crypto_int32.h"
#define int32_sort crypto_kem_ntrulpr4591761_ref_int32_sort
extern void int32_sort(crypto_int32 *,int);
#endif

@@ -0,0 +1,39 @@
#include <string.h>
#include "modq.h"
#include "params.h"
#include "small.h"
#include "rq.h"
#include "crypto_kem.h"
#include "randombytes.h"
#include "crypto_stream_aes256ctr.h"
#if crypto_kem_PUBLICKEYBYTES != rq_encoderounded_len + 32
#error "crypto_kem_PUBLICKEYBYTES must match rq_encoderounded_len + 32"
#endif
#if crypto_kem_SECRETKEYBYTES != small_encode_len + crypto_kem_PUBLICKEYBYTES
#error "crypto_kem_SECRETKEYBYTES must match small_encode_len + crypto_kem_PUBLICKEYBYTES"
#endif
int crypto_kem_keypair(unsigned char *pk,unsigned char *sk)
{
unsigned char K[32];
modq G[p];
small a[p];
modq A[p];
randombytes(K,32);
rq_fromseed(G,K);
small_random_weightw(a);
rq_mult(A,G,a);
rq_round3(A,A);
memcpy(pk,K,32);
rq_encoderounded(pk + 32,A);
small_encode(sk,a);
memcpy(sk + small_encode_len,pk,crypto_kem_PUBLICKEYBYTES);
return 0;
}

@@ -0,0 +1,44 @@
#ifndef modq_h
#define modq_h
#include "crypto_int16.h"
#include "crypto_int32.h"
#include "crypto_uint16.h"
#include "crypto_uint32.h"
typedef crypto_int16 modq;
/* input between -9000000 and 9000000 */
/* output between -2295 and 2295 */
static inline modq modq_freeze(crypto_int32 a)
{
a -= 4591 * ((228 * a) >> 20);
a -= 4591 * ((58470 * a + 134217728) >> 28);
return a;
}
/* input between 0 and 4294967295 */
/* output = (input % 4591) - 2295 */
static inline modq modq_fromuint32(crypto_uint32 a)
{
crypto_int32 r;
r = (a & 524287) + (a >> 19) * 914; /* <= 8010861 */
return modq_freeze(r - 2295);
}
static inline modq modq_plusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A + B * C);
}
static inline modq modq_sum(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A + B);
}
#endif

@@ -0,0 +1,15 @@
#ifndef params_h
#define params_h
#define q 4591
/* XXX: also built into modq in various ways */
#define qshift 2295
#define p 761
#define w 250
#define rq_encode_len 1218
#define rq_encoderounded_len 1015
#define small_encode_len 191
#endif

@@ -0,0 +1,28 @@
#include "params.h"
#include "randombytes.h"
#include "int32_sort.h"
#include "small.h"
#include "crypto_stream_aes256ctr.h"
static const unsigned char n[16] = {0};
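/* constant-time fixed-weight sampling: each 32-bit word keeps 30 random
   bits as a sort key; the low 2 bits mark the first w slots as +-1 and
   the remaining slots as 0, and sorting carries the marks to uniformly
   random positions */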
void small_seeded_weightw(small *f,const unsigned char *k)
{
crypto_int32 r[p];
int i;
crypto_stream_aes256ctr((unsigned char *) r,sizeof r,n,k);
for (i = 0;i < p;++i) r[i] ^= 0x80000000;
for (i = 0;i < w;++i) r[i] &= -2;
for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1;
int32_sort(r,p);
for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1;
}
void small_random_weightw(small *f)
{
unsigned char k[32];
randombytes(k,32);
small_seeded_weightw(f,k);
}

@@ -0,0 +1,31 @@
#ifndef rq_h
#define rq_h
#include "modq.h"
#include "small.h"
#define rq_encode crypto_kem_ntrulpr4591761_ref_rq_encode
extern void rq_encode(unsigned char *,const modq *);
#define rq_decode crypto_kem_ntrulpr4591761_ref_rq_decode
extern void rq_decode(modq *,const unsigned char *);
#define rq_encoderounded crypto_kem_ntrulpr4591761_ref_rq_encoderounded
extern void rq_encoderounded(unsigned char *,const modq *);
#define rq_decoderounded crypto_kem_ntrulpr4591761_ref_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
#define rq_round3 crypto_kem_ntrulpr4591761_ref_rq_round
extern void rq_round3(modq *,const modq *);
#define rq_mult crypto_kem_ntrulpr4591761_ref_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
#define rq_recip3 crypto_kem_ntrulpr4591761_ref_rq_recip3
int rq_recip3(modq *,const small *);
#define rq_fromseed crypto_kem_ntrulpr4591761_ref_rq_fromseed
extern void rq_fromseed(modq *,const unsigned char *);
#endif

@@ -0,0 +1,15 @@
#include "crypto_stream_aes256ctr.h"
#include "rq.h"
#include "params.h"
static const unsigned char n[16] = {0};
void rq_fromseed(modq *h,const unsigned char *K)
{
crypto_uint32 buf[p];
int i;
crypto_stream_aes256ctr((unsigned char *) buf,sizeof buf,n,K);
for (i = 0;i < p;++i)
h[i] = modq_fromuint32(buf[i]);
}

@@ -0,0 +1,30 @@
#include "params.h"
#include "rq.h"
void rq_mult(modq *h,const modq *f,const small *g)
{
modq fg[p + p - 1];
modq result;
int i, j;
for (i = 0;i < p;++i) {
result = 0;
for (j = 0;j <= i;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p;i < p + p - 1;++i) {
result = 0;
for (j = i - p + 1;j < p;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = modq_sum(fg[i - p],fg[i]);
fg[i - p + 1] = modq_sum(fg[i - p + 1],fg[i]);
}
for (i = 0;i < p;++i)
h[i] = fg[i];
}
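The two folding loops at the end implement reduction modulo x^p - x - 1: coefficient fg[i] with i >= p feeds positions i-p and i-p+1 because x^p = x + 1 in this ring. A toy sketch of the same structure, with hypothetical parameters n = 5 and q = 17 chosen only so the arithmetic is easy to follow:

#include <stdio.h>

#define N 5
#define Q 17

/* toy analogue of rq_mult: multiply in (Z/17)[x]/(x^5 - x - 1) */
static void mult(int h[N],const int f[N],const int g[N])
{
  int fg[2*N - 1] = {0};
  int i, j;
  for (i = 0;i < N;++i)
    for (j = 0;j < N;++j)
      fg[i + j] = (fg[i + j] + f[i] * g[j]) % Q;
  for (i = 2*N - 2;i >= N;--i) { /* x^N = x + 1 */
    fg[i - N] = (fg[i - N] + fg[i]) % Q;
    fg[i - N + 1] = (fg[i - N + 1] + fg[i]) % Q;
  }
  for (i = 0;i < N;++i) h[i] = fg[i];
}

int main(void)
{
  int f[N] = {1,2,0,0,3}; /* 1 + 2x + 3x^4 */
  int g[N] = {0,1,0,0,0}; /* x */
  int h[N];
  int i;
  mult(h,f,g);
  /* x*(1 + 2x + 3x^4) = x + 2x^2 + 3x^5 = 3 + 4x + 2x^2 since x^5 = x + 1 */
  for (i = 0;i < N;++i) printf("%d ",h[i]); /* prints: 3 4 2 0 0 */
  printf("\n");
  return 0;
}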

@ -0,0 +1,10 @@
#include "params.h"
#include "rq.h"
void rq_round3(modq *h,const modq *f)
{
int i;
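/* round each coefficient to the nearest multiple of 3 in [-2295,2295]:
   21846/2^16 approximates 1/3, and 2295 = 3*765 keeps the result centered */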
for (i = 0;i < p;++i)
h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295;
}

@ -0,0 +1,101 @@
#include "params.h"
#include "crypto_uint32.h"
#include "rq.h"
void rq_encoderounded(unsigned char *c,const modq *f)
{
crypto_int32 f0, f1, f2;
int i;
for (i = 0;i < p/3;++i) {
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f2 = *f++ + qshift;
f0 = (21846 * f0) >> 16;
f1 = (21846 * f1) >> 16;
f2 = (21846 * f2) >> 16;
/* now want f0 + f1*1536 + f2*1536^2 as a 32-bit integer */
f2 *= 3;
f1 += f2 << 9;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
/* XXX: using p mod 3 = 2 */
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f0 = (21846 * f0) >> 16;
f1 = (21846 * f1) >> 16;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
void rq_decoderounded(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1, c2, c3;
crypto_uint32 f0, f1, f2;
int i;
for (i = 0;i < p/3;++i) {
c0 = *c++;
c1 = *c++;
c2 = *c++;
c3 = *c++;
/* f0 + f1*1536 + f2*1536^2 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 */
/* with each f between 0 and 1530 */
/* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */
/* claim: 2^21 f2 < x < 2^21(f2+1) */
/* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */
/* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */
/* at least 456 - (8/9)255 - (2/9)255 > 0 */
/* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */
f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21;
c2 += c3 << 8;
c2 -= (f2 * 9) << 2;
/* f0 + f1*1536 */
/* = c0 + c1*256 + c2*256^2 */
/* c2 <= 35 = floor((1530+1530*1536)/256^2) */
/* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */
/* claim: 2^21 f1 < x < 2^21(f1+1) */
/* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */
/* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */
/* at least 1365 - (1/3)35 - (1/3)255 - (1/3)255 > 0 */
/* at most 1365 + (4096/3)1530 < 2^21 */
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = modq_freeze(f2 * 3 + q - qshift);
}
c0 = *c++;
c1 = *c++;
c2 = *c++;
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
}
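A minimal sketch of the packing that the proofs above justify: three values in [0,1530] go into four bytes in radix 1536, and the decoder's multiply-and-shift steps recover them exactly.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
  /* encode one triple, as rq_encoderounded does after dividing by 3 */
  uint32_t f0 = 1234, f1 = 56, f2 = 789;
  uint32_t word = f0 + f1*1536 + f2*1536*1536;
  unsigned char c[4];
  uint32_t c0, c1, c2, c3, g0, g1, g2;
  c[0] = word; c[1] = word >> 8; c[2] = word >> 16; c[3] = word >> 24;

  /* decode with the same arithmetic as rq_decoderounded */
  c0 = c[0]; c1 = c[1]; c2 = c[2]; c3 = c[3];
  g2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21;
  c2 += c3 << 8; c2 -= (g2 * 9) << 2;
  g1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
  c1 += c2 << 8; c1 -= (g1 * 3) << 1;
  g0 = c0 + (c1 << 8);
  printf("%u %u %u -> %u %u %u\n",f0,f1,f2,g0,g1,g2);
  return 0;
}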

@ -0,0 +1,37 @@
#include "params.h"
#include "small.h"
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void small_encode(unsigned char *c,const small *f)
{
small c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *f++ + 1;
c0 += (*f++ + 1) << 2;
c0 += (*f++ + 1) << 4;
c0 += (*f++ + 1) << 6;
*c++ = c0;
}
c0 = *f++ + 1;
*c++ = c0;
}
void small_decode(small *f,const unsigned char *c)
{
unsigned char c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1;
}
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1;
}
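A round-trip check for the two functions above, as a standalone sketch; the encoder's accumulator is unsigned char here to sidestep implementation-defined narrowing of values above 127.

#include <stdio.h>
#include <stdlib.h>

#define P 761
typedef signed char small;

/* same layout as small_encode/small_decode: four coefficients in {-1,0,1}
   per byte, two bits each, plus one byte for the final coefficient */
static void encode(unsigned char *c,const small *f)
{
  unsigned char c0;
  int i;
  for (i = 0;i < P/4;++i) {
    c0 = *f++ + 1;
    c0 += (*f++ + 1) << 2;
    c0 += (*f++ + 1) << 4;
    c0 += (*f++ + 1) << 6;
    *c++ = c0;
  }
  *c++ = *f++ + 1;
}

static void decode(small *f,const unsigned char *c)
{
  unsigned char c0;
  int i;
  for (i = 0;i < P/4;++i) {
    c0 = *c++;
    *f++ = (small) (c0 & 3) - 1; c0 >>= 2;
    *f++ = (small) (c0 & 3) - 1; c0 >>= 2;
    *f++ = (small) (c0 & 3) - 1; c0 >>= 2;
    *f++ = (small) (c0 & 3) - 1;
  }
  *f++ = (small) (*c++ & 3) - 1;
}

int main(void)
{
  small f[P], g[P];
  unsigned char c[P/4 + 1];
  int i;
  srand(1);
  for (i = 0;i < P;++i) f[i] = rand() % 3 - 1;
  encode(c,f);
  decode(g,c);
  for (i = 0;i < P;++i)
    if (f[i] != g[i]) { printf("mismatch at %d\n",i); return 1; }
  printf("round trip OK (%d coefficients in %d bytes)\n",P,(int) sizeof c);
  return 0;
}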

@ -0,0 +1,27 @@
#ifndef small_h
#define small_h
#include "crypto_int8.h"
#include "crypto_int32.h"
typedef crypto_int8 small;
#define small_encode crypto_kem_ntrulpr4591761_ref_small_encode
extern void small_encode(unsigned char *,const small *);
#define small_decode crypto_kem_ntrulpr4591761_ref_small_decode
extern void small_decode(small *,const unsigned char *);
#define small_random32 crypto_kem_ntrulpr4591761_ref_small_random32
extern crypto_int32 small_random32(void);
#define small_random crypto_kem_ntrulpr4591761_ref_small_random
extern void small_random(small *);
#define small_seeded_weightw crypto_kem_ntrulpr4591761_ref_small_seeded_weightw
extern void small_seeded_weightw(small *,const unsigned char *);
#define small_random_weightw crypto_kem_ntrulpr4591761_ref_small_random_weightw
extern void small_random_weightw(small *);
#endif

@ -0,0 +1,4 @@
#define CRYPTO_SECRETKEYBYTES 1600
#define CRYPTO_PUBLICKEYBYTES 1218
#define CRYPTO_CIPHERTEXTBYTES 1047
#define CRYPTO_BYTES 32

@ -0,0 +1,67 @@
#ifdef KAT
#include <stdio.h>
#endif
#include "params.h"
#include "small.h"
#include "mod3.h"
#include "rq.h"
#include "r3.h"
#include "crypto_hash_sha512.h"
#include "crypto_verify_32.h"
#include "crypto_kem.h"
int crypto_kem_dec(
unsigned char *k,
const unsigned char *cstr,
const unsigned char *sk
)
{
small f[768];
modq h[768];
small grecip[768];
modq c[768];
modq t[768];
small t3[768];
small r[768];
modq hr[768];
unsigned char rstr[small_encode_len];
unsigned char hash[64];
int i;
int result = 0;
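/* outline: t = c*f; r = (t mod 3)/g in R3; then verify in constant time
   that r has weight w, that rounding h*r reproduces the ciphertext
   polynomial, and that the hash confirmation matches, accumulating every
   failure into the mask "result" */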
small_decode(f,sk);
small_decode(grecip,sk + small_encode_len);
rq_decode(h,sk + 2 * small_encode_len);
rq_decoderounded(c,cstr + 32);
rq_mult(t,c,f);
rq_mod3(t3,t);
r3_mult(r,t3,grecip);
#ifdef KAT
{
int j;
printf("decrypt r:");
for (j = 0;j < p;++j)
if (r[j] == 1) printf(" +%d",j);
else if (r[j] == -1) printf(" -%d",j);
printf("\n");
}
#endif
result |= r3_weightw_mask(r);
rq_mult(hr,h,r);
rq_round3(hr,hr);
for (i = 0;i < p;++i) result |= modq_nonzero_mask(hr[i] - c[i]);
small_encode(rstr,r);
crypto_hash_sha512(hash,rstr,sizeof rstr);
result |= crypto_verify_32(hash,cstr);
for (i = 0;i < 32;++i) k[i] = (hash[32 + i] & ~result);
return result;
}

@ -0,0 +1,48 @@
#ifdef KAT
#include <stdio.h>
#endif
#include <string.h>
#include "params.h"
#include "small.h"
#include "rq.h"
#include "crypto_hash_sha512.h"
#include "crypto_kem.h"
int crypto_kem_enc(
unsigned char *cstr,
unsigned char *k,
const unsigned char *pk
)
{
small r[768];
modq h[768];
modq c[768];
unsigned char rstr[small_encode_len];
unsigned char hash[64];
small_random_weightw(r);
#ifdef KAT
{
int i;
printf("encrypt r:");
for (i = 0;i < p;++i)
if (r[i] == 1) printf(" +%d",i);
else if (r[i] == -1) printf(" -%d",i);
printf("\n");
}
#endif
small_encode(rstr,r);
crypto_hash_sha512(hash,rstr,sizeof rstr);
rq_decode(h,pk);
rq_mult(c,h,r);
memcpy(k,hash + 32,32);
memcpy(cstr,hash,32);
rq_roundencode(cstr + 32,c);
return 0;
}

@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

@ -0,0 +1,425 @@
#include "int32_sort.h"
#include <immintrin.h>
typedef crypto_int32 int32;
static inline void minmax(int32 *x,int32 *y)
{
asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg %%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
: : "r"(x),"r"(y) : "%eax","%ebx","%edx");
}
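/* A portable, branch-free sketch of the same compare-and-swap (the usual
   xor/subtract idiom); the inline asm above stays authoritative because a
   compiler is free to turn this back into a branch: */
static inline void minmax_portable(int32 *x,int32 *y)
{
  int32 xi = *x;
  int32 yi = *y;
  int32 xy = xi ^ yi;
  int32 c = yi - xi;
  c ^= xy & (c ^ yi); /* repair the sign of c if yi - xi overflowed */
  c >>= 31;           /* all-ones if yi < xi, else zero */
  c &= xy;
  *x = xi ^ c;        /* min(xi,yi) */
  *y = yi ^ c;        /* max(xi,yi) */
}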
/* sort x0,x2; sort x1,x3; ... sort x13, x15 */
static inline void minmax02through1315(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi64(a,b); /* a01b01a45b45 */
__m256i d = _mm256_unpackhi_epi64(a,b); /* a23b23a67b67 */
__m256i g = _mm256_min_epi32(c,d);
__m256i h = _mm256_max_epi32(c,d);
a = _mm256_unpacklo_epi64(g,h);
b = _mm256_unpackhi_epi64(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
}
/* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */
static inline void minmax02134657(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0x4e);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0x44);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
}
static void multiminmax2plus2(
int32 *x,
int n)
{
while (n >= 16) {
minmax02through1315(x);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax02134657(x);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 2);
minmax(x + 1,x + 3);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 2);
if (n > 1) minmax(x + 1,x + 3);
}
}
static void multiminmax2plus6(
int32 *x,
int n)
{
while (n >= 4) {
minmax(x,x + 6);
minmax(x + 1,x + 7);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 6);
if (n > 1) minmax(x + 1,x + 7);
}
}
static void multiminmax2plus14(
int32 *x,
int n)
{
while (n >= 8) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
minmax(x + 4,x + 18);
minmax(x + 5,x + 19);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 14);
if (n > 1) minmax(x + 1,x + 15);
}
}
/* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax0145891213(int32 *x,int32 *y)
{
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a0189451213 = _mm256_unpacklo_epi64(a01234567,a89101112131415);
__m256i b0189451213 = _mm256_unpacklo_epi64(b01234567,b89101112131415);
__m256i c0189451213 = _mm256_min_epi32(a0189451213,b0189451213);
__m256i d0189451213 = _mm256_max_epi32(a0189451213,b0189451213);
__m256i c01234567 = _mm256_blend_epi32(a01234567,c0189451213,0x33);
__m256i d01234567 = _mm256_blend_epi32(b01234567,d0189451213,0x33);
__m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213,a89101112131415);
__m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213,b89101112131415);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
}
/* offset >= 30 */
static void multiminmax2plusmore(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax0145891213(x,x + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 5,x + 5 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + offset);
if (n > 1) minmax(x + 1,x + 1 + offset);
}
}
/* sort x0,x1; ... sort x14, x15 */
static inline void minmax01through1415(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi32(a,b); /* ab0ab1ab4ab5 */
__m256i d = _mm256_unpackhi_epi32(a,b); /* ab2ab3ab6ab7 */
__m256i e = _mm256_unpacklo_epi32(c,d); /* a02b02a46b46 */
__m256i f = _mm256_unpackhi_epi32(c,d); /* a13b13a57b57 */
__m256i g = _mm256_min_epi32(e,f); /* a02b02a46b46 */
__m256i h = _mm256_max_epi32(e,f); /* a13b13a57b57 */
a = _mm256_unpacklo_epi32(g,h);
b = _mm256_unpackhi_epi32(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
}
/* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */
static inline void minmax01234567(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0xb1);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0xa0);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
}
static void multiminmax1plus1(
int32 *x,
int n)
{
while (n >= 16) {
minmax01through1415(x);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax01234567(x);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 1);
minmax(x + 2,x + 3);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + 1);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + 1);
}
static void multiminmax1(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
minmax(x + 8,x + 8 + offset);
minmax(x + 10,x + 10 + offset);
minmax(x + 12,x + 12 + offset);
minmax(x + 14,x + 14 + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + offset);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + offset);
}
/* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax02468101214(int32 *x,int32 *y)
{
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a0819412513 = _mm256_unpacklo_epi32(a01234567,a89101112131415);
__m256i a210311614715 = _mm256_unpackhi_epi32(a01234567,a89101112131415);
__m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513,a210311614715);
__m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513,a210311614715);
__m256i b0819412513 = _mm256_unpacklo_epi32(b01234567,b89101112131415);
__m256i b210311614715 = _mm256_unpackhi_epi32(b01234567,b89101112131415);
__m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513,b210311614715);
__m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513,b210311614715);
__m256i c02810461214 = _mm256_min_epi32(a02810461214,b02810461214);
__m256i d02810461214 = _mm256_max_epi32(a02810461214,b02810461214);
__m256i c01234567 = _mm256_unpacklo_epi32(c02810461214,a13911571315);
__m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214,a13911571315);
__m256i d01234567 = _mm256_unpacklo_epi32(d02810461214,b13911571315);
__m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214,b13911571315);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
}
/* assumes offset >= 31 */
static void multiminmax1plusmore(
int32 *x,
int n,
int offset)
{
while (n >= 16) {
minmax02468101214(x,x + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + offset);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + offset);
}
/* sort x0,y0; sort x1,y1; ...; sort x7,y7 */
static inline void minmax8(int32 *x,int32 *y)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) y);
_mm256_storeu_si256((__m256i *) x,_mm256_min_epi32(a,b));
_mm256_storeu_si256((__m256i *) y,_mm256_max_epi32(a,b));
}
/* assumes p >= 8; implies offset >= 8 */
static void multiminmax_atleast8(int p,
int32 *x,
int n,
int offset)
{
int i;
while (n >= 2 * p) {
for (i = 0;i < p;i += 8)
minmax8(x + i,x + i + offset);
n -= 2 * p;
x += 2 * p;
}
for (i = 0;i + 8 <= n;i += 8) {
if (i & p) return;
minmax8(x + i,x + i + offset);
}
for (;i < n;++i) {
if (i & p) return;
minmax(x + i,x + i + offset);
}
}
/* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */
static inline void minmax4(int32 *x,int32 *y)
{
__m128i a = _mm_loadu_si128((__m128i *) x);
__m128i b = _mm_loadu_si128((__m128i *) y);
_mm_storeu_si128((__m128i *) x,_mm_min_epi32(a,b));
_mm_storeu_si128((__m128i *) y,_mm_max_epi32(a,b));
}
static void multiminmax4(
int32 *x,
int n,
int offset)
{
int i;
while (n >= 8) {
minmax4(x,x + offset);
n -= 8;
x += 8;
}
if (n >= 4)
minmax4(x,x + offset);
else
for (i = 0;i < n;++i)
minmax(x + i,x + i + offset);
}
void int32_sort(int32 *x,int n)
{
int top,p,q;
if (n < 2) return;
top = 1;
while (top < n - top) top += top;
for (p = top;p >= 8;p >>= 1) {
multiminmax_atleast8(p,x,n - p,p);
for (q = top;q > p;q >>= 1)
multiminmax_atleast8(p,x + p,n - q,q - p);
}
if (p >= 4) {
multiminmax4(x,n - 4,4);
for (q = top;q > 4;q >>= 1)
multiminmax4(x + 4,n - q,q - 4);
}
if (p >= 2) {
multiminmax2plus2(x,n - 2);
for (q = top;q >= 32;q >>= 1)
multiminmax2plusmore(x + 2,n - q,q - 2);
if (q >= 16)
multiminmax2plus14(x + 2,n - 16);
if (q >= 8)
multiminmax2plus6(x + 2,n - 8);
if (q >= 4)
multiminmax2plus2(x + 2,n - 4);
}
multiminmax1plus1(x,n - 1);
for (q = top;q >= 32;q >>= 1)
multiminmax1plusmore(x + 1,n - q,q - 1);
if (q >= 16)
multiminmax1(x + 1,n - 16,15);
if (q >= 8)
multiminmax1(x + 1,n - 8,7);
if (q >= 4)
multiminmax1(x + 1,n - 4,3);
if (q >= 2)
multiminmax1plus1(x + 1,n - 2);
}

@ -0,0 +1,9 @@
#ifndef int32_sort_h
#define int32_sort_h
#include "crypto_int32.h"
#define int32_sort crypto_kem_sntrup4591761_avx_int32_sort
extern void int32_sort(crypto_int32 *,int);
#endif

@ -0,0 +1,39 @@
#include <string.h>
#include "modq.h"
#include "params.h"
#include "r3.h"
#include "small.h"
#include "rq.h"
#include "crypto_kem.h"
#if crypto_kem_PUBLICKEYBYTES != rq_encode_len
#error "crypto_kem_PUBLICKEYBYTES must match rq_encode_len"
#endif
#if crypto_kem_SECRETKEYBYTES != rq_encode_len + 2 * small_encode_len
#error "crypto_kem_SECRETKEYBYTES must match rq_encode_len + 2 * small_encode_len"
#endif
int crypto_kem_keypair(unsigned char *pk,unsigned char *sk)
{
small g[768];
small grecip[768];
small f[768];
modq f3recip[768];
modq h[768];
do {
small_random(g);
} while (r3_recip(grecip,g) != 0);
small_random_weightw(f);
rq_recip3(f3recip,f);
rq_mult(h,f3recip,g);
rq_encode(pk,h);
small_encode(sk,f);
small_encode(sk + small_encode_len,grecip);
memcpy(sk + 2 * small_encode_len,pk,rq_encode_len);
return 0;
}

@ -0,0 +1,60 @@
#ifndef mod3_h
#define mod3_h
#include "small.h"
#include "crypto_int32.h"
/* -1 if x is nonzero, 0 otherwise */
static inline int mod3_nonzero_mask(small x)
{
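/* x is in {-1,0,1}, so x*x is 0 or 1 and -x*x is 0 or -1 */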
return -x*x;
}
/* input between -100000 and 100000 */
/* output between -1 and 1 */
static inline small mod3_freeze(crypto_int32 a)
{
a -= 3 * ((10923 * a) >> 15);
a -= 3 * ((89478485 * a + 134217728) >> 28);
return a;
}
static inline small mod3_minusproduct(small a,small b,small c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return mod3_freeze(A - B * C);
}
static inline small mod3_plusproduct(small a,small b,small c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return mod3_freeze(A + B * C);
}
static inline small mod3_product(small a,small b)
{
return a * b;
}
static inline small mod3_sum(small a,small b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return mod3_freeze(A + B);
}
static inline small mod3_reciprocal(small a1)
{
return a1;
}
static inline small mod3_quotient(small num,small den)
{
return mod3_product(num,mod3_reciprocal(den));
}
#endif

@ -0,0 +1,91 @@
#ifndef modq_h
#define modq_h
#include "crypto_int16.h"
#include "crypto_int32.h"
#include "crypto_uint16.h"
typedef crypto_int16 modq;
/* input between -9000000 and 9000000 */
/* output between -2295 and 2295 */
static inline modq modq_freeze(crypto_int32 a)
{
a -= 4591 * ((228 * a) >> 20);
a -= 4591 * ((58470 * a + 134217728) >> 28);
return a;
}
static inline modq modq_minusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A - B * C);
}
static inline modq modq_plusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A + B * C);
}
static inline modq modq_product(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A * B);
}
static inline modq modq_square(modq a)
{
crypto_int32 A = a;
return modq_freeze(A * A);
}
static inline modq modq_sum(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A + B);
}
static inline modq modq_reciprocal(modq a1)
{
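/* computes a1^4589 = a1^(q-2), i.e. 1/a1 mod q = 4591, by Fermat's little theorem */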
modq a2 = modq_square(a1);
modq a3 = modq_product(a2,a1);
modq a4 = modq_square(a2);
modq a8 = modq_square(a4);
modq a16 = modq_square(a8);
modq a32 = modq_square(a16);
modq a35 = modq_product(a32,a3);
modq a70 = modq_square(a35);
modq a140 = modq_square(a70);
modq a143 = modq_product(a140,a3);
modq a286 = modq_square(a143);
modq a572 = modq_square(a286);
modq a1144 = modq_square(a572);
modq a1147 = modq_product(a1144,a3);
modq a2294 = modq_square(a1147);
modq a4588 = modq_square(a2294);
modq a4589 = modq_product(a4588,a1);
return a4589;
}
static inline modq modq_quotient(modq num,modq den)
{
return modq_product(num,modq_reciprocal(den));
}
/* -1 if x is nonzero, 0 otherwise */
static inline int modq_nonzero_mask(modq x)
{
crypto_int32 r = (crypto_uint16) x;
r = -r;
r >>= 30;
return r;
}
#endif

@ -0,0 +1,762 @@
#include <string.h>
#include <immintrin.h>
#include "mod3.h"
#include "rq.h"
#include "r3.h"
#define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
h0 += f0 * gj; \
_mm256_storeu_ps(&h[i + j],h0); \
h1 += f1 * gj; \
h2 += f2 * gj; \
h3 += f3 * gj; \
h4 += f4 * gj; \
h0 = _mm256_loadu_ps(&h[i + j + 5]); \
h0 += f5 * gj;
#define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vmovups %13,%0 \n\t" \
"vfmadd231ps %5,%7,%1 \n\t" \
"vfmadd231ps %5,%8,%2 \n\t" \
"vfmadd231ps %5,%9,%3 \n\t" \
"vfmadd231ps %5,%10,%4 \n\t" \
"vfmadd231ps %5,%11,%0 \n\t" \
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5]));
#define MULSTEP MULSTEP_asm
#define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vfmadd231ps %5,%7,%1 \n\t" \
"vfmadd231ps %5,%8,%2 \n\t" \
"vfmadd231ps %5,%9,%3 \n\t" \
"vfmadd231ps %5,%10,%4 \n\t" \
"vmulps %5,%11,%0 \n\t" \
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
#define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \
gj = g[j]; \
__asm__( \
"vmulps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vmulps %5,%7,%1 \n\t" \
"vmulps %5,%8,%2 \n\t" \
"vmulps %5,%9,%3 \n\t" \
"vmulps %5,%10,%4 \n\t" \
"vmulps %5,%11,%0 \n\t" \
: "=&x"(h0),"=&x"(h1),"=&x"(h2),"=&x"(h3),"=&x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));
static inline __m128i _mm_load_cvtepi8_epi16(const long long *x)
{
__m128i result;
__asm__("vpmovsxbw %1, %0" : "=x"(result) : "m"(*x));
return result;
}
#define v0 _mm256_set1_epi32(0)
#define v0_128 _mm_set1_epi32(0)
#define v7 _mm256_set1_epi16(7)
#define v4591_16 _mm256_set1_epi16(4591)
#define v2296_16 _mm256_set1_epi16(2296)
#define alpha_32 _mm256_set1_epi32(0x4b400000)
#define alpha_32_128 _mm_set1_epi32(0x4b400000)
#define alpha_float _mm256_set1_ps(12582912.0)
#define v0_float _mm256_set1_ps(0)
#define v1_float _mm256_set1_ps(1)
#define vm1_float _mm256_set1_ps(-1)
#define vm4591_float _mm256_set1_ps(-4591)
#define recip4591_float _mm256_set1_ps(0.00021781746896101067305597908952297974298)
static inline __m256 add(__m256 x,__m256 y)
{
return x + y;
}
static inline __m256 fastadd(__m256 x,__m256 y)
{
return _mm256_fmadd_ps(y,v1_float,x);
}
static inline __m256 fastsub(__m256 x,__m256 y)
{
return _mm256_fmadd_ps(y,vm1_float,x);
}
static inline __m256 reduce(__m256 x)
{
__m256 q = x * recip4591_float;
q = _mm256_round_ps(q,8);
return _mm256_fmadd_ps(q,vm4591_float,x);
}
static inline __m256i squeeze(__m256i x)
{
__m256i q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
}
static inline __m256i squeezeadd16(__m256i x,__m256i y)
{
__m256i q;
x = _mm256_add_epi16(x,y);
q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
}
static inline __m256i freeze(__m256i x)
{
__m256i mask, x2296, x4591;
x4591 = _mm256_add_epi16(x,v4591_16);
mask = _mm256_srai_epi16(x,15);
x = _mm256_blendv_epi8(x,x4591,mask);
x2296 = _mm256_sub_epi16(x,v2296_16);
mask = _mm256_srai_epi16(x2296,15);
x4591 = _mm256_sub_epi16(x,v4591_16);
x = _mm256_blendv_epi8(x4591,x,mask);
return x;
}
/* 24*8*float32 f inputs between -10000 and 10000 */
/* 24*8*float32 g inputs between -32 and 32 */
/* 48*8*float32 h outputs between -7680000 and 7680000 */
static void mult24x8_float(__m256 h[48],const __m256 f[24],const __m256 g[24])
{
int i, j;
__m256 f0, f1, f2, f3, f4, f5, gj, h0, h1, h2, h3, h4;
i = 0;
f0 = f[i];
f1 = f[i + 1];
f2 = f[i + 2];
f3 = f[i + 3];
f4 = f[i + 4];
f5 = f[i + 5];
MULSTEP_fromzero(0,h0,h1,h2,h3,h4)
for (j = 0;j < 20;j += 5) {
MULSTEP_noload(j + 1,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
MULSTEP_noload(j + 4,h4,h0,h1,h2,h3)
MULSTEP_noload(j + 5,h0,h1,h2,h3,h4)
}
MULSTEP_noload(j + 1,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
h[i + j + 4] = h4;
h[i + j + 5] = h0;
h[i + j + 6] = h1;
h[i + j + 7] = h2;
h[i + j + 8] = h3;
for (i = 6;i < 24;i += 6) {
f0 = f[i];
f1 = f[i + 1];
f2 = f[i + 2];
f3 = f[i + 3];
f4 = f[i + 4];
f5 = f[i + 5];
h0 = h[i];
h1 = h[i + 1];
h2 = h[i + 2];
h3 = h[i + 3];
h4 = h[i + 4];
for (j = 0;j < 15;j += 5) {
MULSTEP(j + 0,h0,h1,h2,h3,h4)
MULSTEP(j + 1,h1,h2,h3,h4,h0)
MULSTEP(j + 2,h2,h3,h4,h0,h1)
MULSTEP(j + 3,h3,h4,h0,h1,h2)
MULSTEP(j + 4,h4,h0,h1,h2,h3)
}
MULSTEP(j + 0,h0,h1,h2,h3,h4)
MULSTEP(j + 1,h1,h2,h3,h4,h0)
MULSTEP(j + 2,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
MULSTEP_noload(j + 4,h4,h0,h1,h2,h3)
MULSTEP_noload(j + 5,h0,h1,h2,h3,h4)
MULSTEP_noload(j + 6,h1,h2,h3,h4,h0)
MULSTEP_noload(j + 7,h2,h3,h4,h0,h1)
MULSTEP_noload(j + 8,h3,h4,h0,h1,h2)
h[i + j + 9] = h4;
h[i + j + 10] = h0;
h[i + j + 11] = h1;
h[i + j + 12] = h2;
h[i + j + 13] = h3;
}
h[47] = v0_float;
}
/* 48*8*float32 f inputs between -5000 and 5000 */
/* 48*8*float32 g inputs between -16 and 16 */
/* 96*8*float32 h outputs between -3840000 and 3840000 */
static void mult48x8_float(__m256 h[96],const __m256 f[48],const __m256 g[48])
{
__m256 h01[48];
__m256 g01[24];
__m256 *f01 = h01 + 24;
int i;
for (i = 24;i > 0;) {
i -= 2;
f01[i] = f[i] + f[i + 24];
g01[i] = g[i] + g[i + 24];
f01[i + 1] = f[i + 1] + f[i + 1 + 24];
g01[i + 1] = g[i + 1] + g[i + 1 + 24];
}
mult24x8_float(h,f,g);
mult24x8_float(h + 48,f + 24,g + 24);
mult24x8_float(h01,f01,g01);
for (i = 0;i < 24;++i) {
__m256 h0i = h[i];
__m256 h0itop = h[i + 24];
__m256 h1i = h[i + 48];
__m256 h1itop = h[i + 72];
__m256 h01i = h01[i];
__m256 h01itop = h01[i + 24];
__m256 c = fastsub(h0itop,h1i);
h[i + 24] = c + fastsub(h01i,h0i);
h[i + 48] = fastsub(h01itop,h1itop) - c;
}
}
/* 96*8*float32 f inputs between -2500 and 2500 */
/* 96*8*float32 g inputs between -8 and 8 */
/* 192*8*float32 h outputs between -1920000 and 1920000 */
static void mult96x8_float(__m256 h[192],const __m256 f[96],const __m256 g[96])
{
__m256 h01[96];
__m256 g01[48];
__m256 *f01 = h01 + 48;
int i;
for (i = 48;i > 0;) {
i -= 4;
f01[i] = f[i] + f[i + 48];
g01[i] = g[i] + g[i + 48];
f01[i + 1] = f[i + 1] + f[i + 1 + 48];
g01[i + 1] = g[i + 1] + g[i + 1 + 48];
f01[i + 2] = f[i + 2] + f[i + 2 + 48];
g01[i + 2] = g[i + 2] + g[i + 2 + 48];
f01[i + 3] = f[i + 3] + f[i + 3 + 48];
g01[i + 3] = g[i + 3] + g[i + 3 + 48];
}
mult48x8_float(h,f,g);
mult48x8_float(h + 96,f + 48,g + 48);
mult48x8_float(h01,f01,g01);
for (i = 0;i < 48;++i) {
__m256 h0i = h[i];
__m256 h0itop = h[i + 48];
__m256 h1i = h[i + 96];
__m256 h1itop = h[i + 144];
__m256 h01i = h01[i];
__m256 h01itop = h01[i + 48];
__m256 c = fastsub(h0itop,h1i);
h[i + 48] = c + fastsub(h01i,h0i);
h[i + 96] = fastsub(h01itop,h1itop) - c;
}
}
/* 96*16*int16 f inputs between -2500 and 2500 */
/* 96*(16*int8 stored in 32*int8) g inputs between -8 and 8 */
/* 192*16*int16 h outputs between -2400 and 2400 */
static void mult96x16(__m256i h[192],const __m256i f[96],const __m256i g[96])
{
__m256 hfloat[192];
__m256 gfloat[96];
__m256 *ffloat = hfloat + 96;
int i, p;
for (p = 0;p < 2;++p) {
for (i = 96;i > 0;) {
i -= 2;
__m256i fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i]));
__m256i gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i]));
__m256 storage;
*(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32);
ffloat[i] = storage - alpha_float;
*(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32);
gfloat[i] = storage - alpha_float;
fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i + 1]));
gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i + 1]));
*(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32);
ffloat[i + 1] = storage - alpha_float;
*(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32);
gfloat[i + 1] = storage - alpha_float;
}
mult96x8_float(hfloat,ffloat,gfloat);
for (i = 192;i > 0;) {
__m128i h0, h1;
i -= 4;
hfloat[i] = add(alpha_float,reduce(hfloat[i]));
hfloat[i + 1] = fastadd(alpha_float,reduce(hfloat[i + 1]));
hfloat[i + 2] = add(alpha_float,reduce(hfloat[i + 2]));
hfloat[i + 3] = fastadd(alpha_float,reduce(hfloat[i + 3]));
h0 = 0[(__m128i *) &hfloat[i]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 1]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 1]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 1],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 2]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 2]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 2],_mm_packs_epi32(h0,h1));
h0 = 0[(__m128i *) &hfloat[i + 3]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
h1 = 1[(__m128i *) &hfloat[i + 3]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
_mm_storeu_si128(p + (__m128i *) &h[i + 3],_mm_packs_epi32(h0,h1));
}
}
}
/* int16 i of output x[j] is int16 j of input x[i] */
static void transpose16(__m256i x[16])
{
static const int rev[4] = {0,4,2,6};
int i;
__m256i y[16];
for (i = 0;i < 16;i += 4) {
__m256i a0 = x[i];
__m256i a1 = x[i + 1];
__m256i a2 = x[i + 2];
__m256i a3 = x[i + 3];
__m256i b0 = _mm256_unpacklo_epi16(a0,a1);
__m256i b1 = _mm256_unpackhi_epi16(a0,a1);
__m256i b2 = _mm256_unpacklo_epi16(a2,a3);
__m256i b3 = _mm256_unpackhi_epi16(a2,a3);
__m256i c0 = _mm256_unpacklo_epi32(b0,b2);
__m256i c2 = _mm256_unpackhi_epi32(b0,b2);
__m256i c1 = _mm256_unpacklo_epi32(b1,b3);
__m256i c3 = _mm256_unpackhi_epi32(b1,b3);
y[i] = c0;
y[i + 2] = c2;
y[i + 1] = c1;
y[i + 3] = c3;
}
for (i = 0;i < 4;++i) {
int r = rev[i];
__m256i c0 = y[i];
__m256i c4 = y[i + 4];
__m256i c8 = y[i + 8];
__m256i c12 = y[i + 12];
__m256i d0 = _mm256_unpacklo_epi64(c0,c4);
__m256i d4 = _mm256_unpackhi_epi64(c0,c4);
__m256i d8 = _mm256_unpacklo_epi64(c8,c12);
__m256i d12 = _mm256_unpackhi_epi64(c8,c12);
__m256i e0 = _mm256_permute2x128_si256(d0,d8,0x20);
__m256i e8 = _mm256_permute2x128_si256(d0,d8,0x31);
__m256i e4 = _mm256_permute2x128_si256(d4,d12,0x20);
__m256i e12 = _mm256_permute2x128_si256(d4,d12,0x31);
x[r] = e0;
x[r + 8] = e8;
x[r + 1] = e4;
x[r + 9] = e12;
}
}
/* byte i of output x[j] is byte j of input x[i] */
static void transpose32(__m256i x[32])
{
static const int rev[4] = {0,8,4,12};
int i;
__m256i y[32];
for (i = 0;i < 32;i += 4) {
__m256i a0 = x[i];
__m256i a1 = x[i + 1];
__m256i a2 = x[i + 2];
__m256i a3 = x[i + 3];
__m256i b0 = _mm256_unpacklo_epi8(a0,a1);
__m256i b1 = _mm256_unpackhi_epi8(a0,a1);
__m256i b2 = _mm256_unpacklo_epi8(a2,a3);
__m256i b3 = _mm256_unpackhi_epi8(a2,a3);
__m256i c0 = _mm256_unpacklo_epi16(b0,b2);
__m256i c2 = _mm256_unpackhi_epi16(b0,b2);
__m256i c1 = _mm256_unpacklo_epi16(b1,b3);
__m256i c3 = _mm256_unpackhi_epi16(b1,b3);
y[i] = c0;
y[i + 2] = c2;
y[i + 1] = c1;
y[i + 3] = c3;
}
for (i = 0;i < 4;++i) {
int r = rev[i];
__m256i c0 = y[i];
__m256i c8 = y[i + 8];
__m256i c16 = y[i + 16];
__m256i c24 = y[i + 24];
__m256i c4 = y[i + 4];
__m256i c12 = y[i + 12];
__m256i c20 = y[i + 20];
__m256i c28 = y[i + 28];
__m256i d0 = _mm256_unpacklo_epi32(c0,c4);
__m256i d4 = _mm256_unpackhi_epi32(c0,c4);
__m256i d8 = _mm256_unpacklo_epi32(c8,c12);
__m256i d12 = _mm256_unpackhi_epi32(c8,c12);
__m256i d16 = _mm256_unpacklo_epi32(c16,c20);
__m256i d20 = _mm256_unpackhi_epi32(c16,c20);
__m256i d24 = _mm256_unpacklo_epi32(c24,c28);
__m256i d28 = _mm256_unpackhi_epi32(c24,c28);
__m256i e0 = _mm256_unpacklo_epi64(d0,d8);
__m256i e8 = _mm256_unpackhi_epi64(d0,d8);
__m256i e16 = _mm256_unpacklo_epi64(d16,d24);
__m256i e24 = _mm256_unpackhi_epi64(d16,d24);
__m256i e4 = _mm256_unpacklo_epi64(d4,d12);
__m256i e12 = _mm256_unpackhi_epi64(d4,d12);
__m256i e20 = _mm256_unpacklo_epi64(d20,d28);
__m256i e28 = _mm256_unpackhi_epi64(d20,d28);
__m256i f0 = _mm256_permute2x128_si256(e0,e16,0x20);
__m256i f16 = _mm256_permute2x128_si256(e0,e16,0x31);
__m256i f8 = _mm256_permute2x128_si256(e8,e24,0x20);
__m256i f24 = _mm256_permute2x128_si256(e8,e24,0x31);
__m256i f4 = _mm256_permute2x128_si256(e4,e20,0x20);
__m256i f20 = _mm256_permute2x128_si256(e4,e20,0x31);
__m256i f12 = _mm256_permute2x128_si256(e12,e28,0x20);
__m256i f28 = _mm256_permute2x128_si256(e12,e28,0x31);
x[r] = f0;
x[r + 16] = f16;
x[r + 1] = f8;
x[r + 17] = f24;
x[r + 2] = f4;
x[r + 18] = f20;
x[r + 3] = f12;
x[r + 19] = f28;
}
}
/* 48*16*int16 f inputs between -2295 and 2295 */
/* 24*32*int8 g inputs between -1 and 1 */
/* 96*16*int16 h outputs between -2295 and 2295 */
static void mult768_mix2_m256i(__m256i h[96],const __m256i f[48],const __m256i g[24])
{
__m256i hkara[24][16];
__m256i gkara[3][32];
#define fkara hkara
int i;
for (i = 6;i-- > 0;) {
__m256i f0, f1, f2, f3, f4, f5, f6, f7;
__m256i f01, f23, f45, f67;
__m256i f02, f46, f04, f26, f0426;
__m256i f13, f57, f15, f37, f1537;
__m256i f0213, f4657, f04261537, f0415, f2637;
f0 = _mm256_loadu_si256(&f[i + 0]);
f1 = _mm256_loadu_si256(&f[i + 6]);
f2 = _mm256_loadu_si256(&f[i + 12]);
f3 = _mm256_loadu_si256(&f[i + 18]);
f4 = _mm256_loadu_si256(&f[i + 24]);
f5 = _mm256_loadu_si256(&f[i + 30]);
f6 = _mm256_loadu_si256(&f[i + 36]);
f7 = _mm256_loadu_si256(&f[i + 42]);
f01 = squeezeadd16(f0,f1); fkara[i][8] = f01;
f23 = squeezeadd16(f2,f3); fkara[i][9] = f23;
f45 = squeezeadd16(f4,f5); fkara[i][10] = f45;
f67 = squeezeadd16(f6,f7); fkara[i][11] = f67;
fkara[i][0] = f0;
fkara[i][2] = f2;
fkara[i][4] = f4;
fkara[i][6] = f6;
f02 = squeezeadd16(f0,f2); fkara[i + 6][0] = f02;
f04 = squeezeadd16(f0,f4); fkara[i + 6][6] = f04;
f46 = squeezeadd16(f4,f6); fkara[i + 6][3] = f46;
f26 = squeezeadd16(f2,f6); fkara[i + 6][8] = f26;
fkara[i][1] = f1;
fkara[i][3] = f3;
fkara[i][5] = f5;
fkara[i][7] = f7;
f13 = squeezeadd16(f1,f3); fkara[i + 6][1] = f13;
f15 = squeezeadd16(f1,f5); fkara[i + 6][7] = f15;
f57 = squeezeadd16(f5,f7); fkara[i + 6][4] = f57;
f37 = squeezeadd16(f3,f7); fkara[i + 6][9] = f37;
f0426 = squeezeadd16(f04,f26); fkara[i + 6][12] = f0426;
f1537 = squeezeadd16(f15,f37); fkara[i + 6][13] = f1537;
f0213 = squeezeadd16(f02,f13); fkara[i + 6][2] = f0213;
f4657 = squeezeadd16(f46,f57); fkara[i + 6][5] = f4657;
f0415 = squeezeadd16(f04,f15); fkara[i + 6][10] = f0415;
f2637 = squeezeadd16(f26,f37); fkara[i + 6][11] = f2637;
f04261537 = squeezeadd16(f0426,f1537); fkara[i + 6][14] = f04261537;
fkara[i][12] = v0;
fkara[i][13] = v0;
fkara[i][14] = v0;
fkara[i][15] = v0;
fkara[i + 6][15] = v0;
}
for (i = 3;i-- > 0;) {
__m256i g0, g1, g2, g3, g4, g5, g6, g7;
__m256i g01, g23, g45, g67;
__m256i g02, g46, g04, g26, g0426;
__m256i g13, g57, g15, g37, g1537;
__m256i g0213, g4657, g04261537, g0415, g2637;
g0 = _mm256_loadu_si256(&g[i + 0]);
g1 = _mm256_loadu_si256(&g[i + 3]);
g2 = _mm256_loadu_si256(&g[i + 6]);
g3 = _mm256_loadu_si256(&g[i + 9]);
g4 = _mm256_loadu_si256(&g[i + 12]);
g5 = _mm256_loadu_si256(&g[i + 15]);
g6 = _mm256_loadu_si256(&g[i + 18]);
g7 = _mm256_loadu_si256(&g[i + 21]);
g01 = _mm256_add_epi8(g0,g1); gkara[i][8] = g01;
g23 = _mm256_add_epi8(g2,g3); gkara[i][9] = g23;
g45 = _mm256_add_epi8(g4,g5); gkara[i][10] = g45;
g67 = _mm256_add_epi8(g6,g7); gkara[i][11] = g67;
gkara[i][0] = g0;
gkara[i][2] = g2;
gkara[i][4] = g4;
gkara[i][6] = g6;
g02 = _mm256_add_epi8(g0,g2); gkara[i][16] = g02;
g04 = _mm256_add_epi8(g0,g4); gkara[i][22] = g04;
g46 = _mm256_add_epi8(g4,g6); gkara[i][19] = g46;
g26 = _mm256_add_epi8(g2,g6); gkara[i][24] = g26;
gkara[i][1] = g1;
gkara[i][3] = g3;
gkara[i][5] = g5;
gkara[i][7] = g7;
g13 = _mm256_add_epi8(g1,g3); gkara[i][17] = g13;
g15 = _mm256_add_epi8(g1,g5); gkara[i][23] = g15;
g57 = _mm256_add_epi8(g5,g7); gkara[i][20] = g57;
g37 = _mm256_add_epi8(g3,g7); gkara[i][25] = g37;
g0426 = _mm256_add_epi8(g04,g26); gkara[i][28] = g0426;
g1537 = _mm256_add_epi8(g15,g37); gkara[i][29] = g1537;
g0213 = _mm256_add_epi8(g02,g13); gkara[i][18] = g0213;
g4657 = _mm256_add_epi8(g46,g57); gkara[i][21] = g4657;
g0415 = _mm256_add_epi8(g04,g15); gkara[i][26] = g0415;
g2637 = _mm256_add_epi8(g26,g37); gkara[i][27] = g2637;
g04261537 = _mm256_add_epi8(g0426,g1537); gkara[i][30] = g04261537;
gkara[i][12] = v0;
gkara[i][13] = v0;
gkara[i][14] = v0;
gkara[i][15] = v0;
gkara[i][31] = v0;
}
for (i = 12;i-- > 0;)
transpose16(fkara[i]);
for (i = 3;i-- > 0;)
transpose32(gkara[i]);
mult96x16(hkara[12],fkara[6],(__m256i *) (1 + (__m128i *) gkara));
mult96x16(hkara[0],fkara[0],gkara[0]);
for (i = 24;i-- > 0;)
transpose16(hkara[i]);
for (i = 6;i-- > 0;) {
__m256i h0,h1,h2,h3,h4,h5,h6,h7,h8,h9;
__m256i h10,h11,h12,h13,h14,h15,h16,h17,h18,h19;
__m256i h20,h21,h22,h23;
__m256i h32,h33,h34,h35,h36,h37,h38,h39;
__m256i h40,h41,h42,h43,h44,h45,h46,h47,h48,h49;
__m256i h50,h51,h52,h53,h54,h55,h56,h57,h58,h59;
__m256i h60,h61;
__m256i c;
#define COMBINE(h0,h1,h2,h3,x0,x1) \
c = _mm256_sub_epi16(h1,h2); \
h1 = _mm256_sub_epi16(_mm256_add_epi16(c,x0),h0); \
h2 = _mm256_sub_epi16(x1,_mm256_add_epi16(c,h3)); \
h1 = squeeze(h1); \
h2 = squeeze(h2);
h56 = hkara[i + 12][12];
h57 = hkara[i + 18][12];
h58 = hkara[i + 12][13];
h59 = hkara[i + 18][13];
h60 = hkara[i + 12][14];
h61 = hkara[i + 18][14];
COMBINE(h56,h57,h58,h59,h60,h61)
h44 = hkara[i + 12][6];
h45 = hkara[i + 18][6];
h46 = hkara[i + 12][7];
h47 = hkara[i + 18][7];
h52 = hkara[i + 12][10];
h53 = hkara[i + 18][10];
COMBINE(h44,h45,h46,h47,h52,h53)
h48 = hkara[i + 12][8];
h49 = hkara[i + 18][8];
h50 = hkara[i + 12][9];
h51 = hkara[i + 18][9];
h54 = hkara[i + 12][11];
h55 = hkara[i + 18][11];
COMBINE(h48,h49,h50,h51,h54,h55)
COMBINE(h44,h46,h48,h50,h56,h58)
COMBINE(h45,h47,h49,h51,h57,h59)
h0 = hkara[i][0];
h1 = hkara[i + 6][0];
h2 = hkara[i][1];
h3 = hkara[i + 6][1];
h16 = hkara[i][8];
h17 = hkara[i + 6][8];
COMBINE(h0,h1,h2,h3,h16,h17)
h4 = hkara[i][2];
h5 = hkara[i + 6][2];
h6 = hkara[i][3];
h7 = hkara[i + 6][3];
h18 = hkara[i][9];
h19 = hkara[i + 6][9];
COMBINE(h4,h5,h6,h7,h18,h19)
h32 = hkara[i + 12][0];
h33 = hkara[i + 18][0];
h34 = hkara[i + 12][1];
h35 = hkara[i + 18][1];
h36 = hkara[i + 12][2];
h37 = hkara[i + 18][2];
COMBINE(h32,h33,h34,h35,h36,h37)
COMBINE(h1,h3,h5,h7,h33,h35)
COMBINE(h0,h2,h4,h6,h32,h34)
h8 = hkara[i][4];
h9 = hkara[i + 6][4];
h10 = hkara[i][5];
h11 = hkara[i + 6][5];
h20 = hkara[i][10];
h21 = hkara[i + 6][10];
COMBINE(h8,h9,h10,h11,h20,h21)
h12 = hkara[i][6];
h13 = hkara[i + 6][6];
h14 = hkara[i][7];
h15 = hkara[i + 6][7];
h22 = hkara[i][11];
h23 = hkara[i + 6][11];
COMBINE(h12,h13,h14,h15,h22,h23)
h38 = hkara[i + 12][3];
h39 = hkara[i + 18][3];
h40 = hkara[i + 12][4];
h41 = hkara[i + 18][4];
h42 = hkara[i + 12][5];
h43 = hkara[i + 18][5];
COMBINE(h38,h39,h40,h41,h42,h43)
COMBINE(h8,h10,h12,h14,h38,h40)
COMBINE(h9,h11,h13,h15,h39,h41)
COMBINE(h0,h4,h8,h12,h44,h48)
h0 = freeze(h0);
h4 = freeze(h4);
h8 = freeze(h8);
h12 = freeze(h12);
_mm256_storeu_si256(&h[i + 0],h0);
_mm256_storeu_si256(&h[i + 24],h4);
_mm256_storeu_si256(&h[i + 48],h8);
_mm256_storeu_si256(&h[i + 72],h12);
COMBINE(h1,h5,h9,h13,h45,h49)
h1 = freeze(h1);
h5 = freeze(h5);
h9 = freeze(h9);
h13 = freeze(h13);
_mm256_storeu_si256(&h[i + 6],h1);
_mm256_storeu_si256(&h[i + 30],h5);
_mm256_storeu_si256(&h[i + 54],h9);
_mm256_storeu_si256(&h[i + 78],h13);
COMBINE(h2,h6,h10,h14,h46,h50)
h2 = freeze(h2);
h6 = freeze(h6);
h10 = freeze(h10);
h14 = freeze(h14);
_mm256_storeu_si256(&h[i + 12],h2);
_mm256_storeu_si256(&h[i + 36],h6);
_mm256_storeu_si256(&h[i + 60],h10);
_mm256_storeu_si256(&h[i + 84],h14);
COMBINE(h3,h7,h11,h15,h47,h51)
h3 = freeze(h3);
h7 = freeze(h7);
h11 = freeze(h11);
h15 = freeze(h15);
_mm256_storeu_si256(&h[i + 18],h3);
_mm256_storeu_si256(&h[i + 42],h7);
_mm256_storeu_si256(&h[i + 66],h11);
_mm256_storeu_si256(&h[i + 90],h15);
}
}
#define p 761
/* 761 f inputs between -2295 and 2295 */
/* 761 g inputs between -1 and 1 */
/* 761 h outputs between -2295 and 2295 */
void rq_mult(modq *h,const modq *f,const small *g)
{
__m256i fgvec[96];
modq *fg;
int i;
mult768_mix2_m256i(fgvec,(__m256i *) f,(__m256i *) g);
fg = (modq *) fgvec;
h[0] = modq_freeze(fg[0] + fg[p]);
for (i = 1;i < 9;++i)
h[i] = modq_freeze(fg[i] + fg[i + p - 1] + fg[i + p]);
for (i = 9;i < 761;i += 16) {
__m256i fgi = _mm256_loadu_si256((__m256i *) &fg[i]);
__m256i fgip = _mm256_loadu_si256((__m256i *) &fg[i + p]);
__m256i fgip1 = _mm256_loadu_si256((__m256i *) &fg[i + p - 1]);
__m256i x = _mm256_add_epi16(fgi,_mm256_add_epi16(fgip,fgip1));
x = freeze(squeeze(x));
_mm256_storeu_si256((__m256i *) &h[i],x);
}
for (i = 761;i < 768;++i)
h[i] = 0;
}
void r3_mult(small *h,const small *f,const small *g)
{
__m256i fgvec[96];
__m256i fvec[48];
modq *fg;
int i;
memset(fvec,0,sizeof fvec);
for (i = 0;i < 761;++i)
i[(modq *) fvec] = f[i];
mult768_mix2_m256i(fgvec,fvec,(__m256i *) g);
fg = (modq *) fgvec;
h[0] = mod3_freeze(fg[0] + fg[p]);
for (i = 1;i < p;++i)
h[i] = mod3_freeze(fg[i] + fg[i + p - 1] + fg[i + p]);
for (i = p;i < 768;++i)
h[i] = 0;
}
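Each *_float multiplier above splits a size-2n product into three size-n products and recombines them columnwise (the fastsub arithmetic here, and COMBINE further up). A scalar sketch of that one-level Karatsuba recombination, with toy length N = 2:

#include <stdio.h>

#define N 2

/* schoolbook product of two length-N polynomials */
static void mult_n(int h[2*N - 1],const int f[N],const int g[N])
{
  int i, j;
  for (i = 0;i < 2*N - 1;++i) h[i] = 0;
  for (i = 0;i < N;++i)
    for (j = 0;j < N;++j)
      h[i + j] += f[i] * g[j];
}

int main(void)
{
  /* f = f0 + x^N f1, g = g0 + x^N g1:
     fg = f0 g0 + x^N ((f0+f1)(g0+g1) - f0 g0 - f1 g1) + x^2N f1 g1 */
  int f[2*N] = {1,2,3,4}, g[2*N] = {5,6,7,8};
  int f01[N], g01[N], h0[2*N - 1], h1[2*N - 1], h01[2*N - 1];
  int h[4*N - 1] = {0};
  int i;
  for (i = 0;i < N;++i) { f01[i] = f[i] + f[i + N]; g01[i] = g[i] + g[i + N]; }
  mult_n(h0,f,g);
  mult_n(h1,f + N,g + N);
  mult_n(h01,f01,g01);
  for (i = 0;i < 2*N - 1;++i) {
    h[i] += h0[i];
    h[i + N] += h01[i] - h0[i] - h1[i];
    h[i + 2*N] += h1[i];
  }
  for (i = 0;i < 4*N - 1;++i) printf("%d ",h[i]); /* prints: 5 16 34 60 61 52 32 */
  printf("\n");
  return 0;
}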

@ -0,0 +1,14 @@
#ifndef params_h
#define params_h
#define q 4591
/* XXX: also built into modq in various ways */
#define qshift 2295
#define p 761
#define w 286
#define rq_encode_len 1218
#define small_encode_len 191
#endif

@ -0,0 +1,15 @@
#ifndef r3_h
#define r3_h
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_avx_r3_mult
extern void r3_mult(small *,const small *,const small *);
#define r3_recip crypto_kem_sntrup4591761_avx_r3_recip
extern int r3_recip(small *,const small *);
#define r3_weightw_mask crypto_kem_sntrup4591761_avx_r3_weightw_mask
extern int r3_weightw_mask(const small *);
#endif

@ -0,0 +1,192 @@
#include <immintrin.h>
#include "params.h"
#include "mod3.h"
#include "swap.h"
#include "r3.h"
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
{
return (x - y) >> 31;
}
static void vectormod3_product(small *z,int len,const small *x,const small c)
{
int i;
int minusmask = c;
int plusmask = -c;
__m256i minusvec, plusvec, zerovec;
minusmask >>= 31;
plusmask >>= 31;
minusvec = _mm256_set1_epi32(minusmask);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
while (len >= 32) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec,xi) & minusvec);
_mm256_storeu_si256((__m256i *) z,xi);
x += 32;
z += 32;
len -= 32;
}
for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c);
}
static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c)
{
int i;
int minusmask = c;
int plusmask = -c;
__m256i minusvec, plusvec, zerovec, twovec, fourvec;
minusmask >>= 31;
plusmask >>= 31;
minusvec = _mm256_set1_epi32(minusmask);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
twovec = _mm256_set1_epi32(0x02020202);
fourvec = _mm256_set1_epi32(0x04040404);
while (len >= 32) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
__m256i yi = _mm256_loadu_si256((__m256i *) y);
__m256i r;
yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec,yi) & minusvec);
xi = _mm256_sub_epi8(xi,yi);
r = _mm256_add_epi8(xi,twovec);
r &= fourvec;
r = _mm256_srli_epi32(r,2);
xi = _mm256_sub_epi8(xi,r);
r = _mm256_add_epi8(r,r);
xi = _mm256_sub_epi8(xi,r);
r = _mm256_sub_epi8(twovec,xi);
r &= fourvec;
r = _mm256_srli_epi32(r,2);
xi = _mm256_add_epi8(xi,r);
r = _mm256_add_epi8(r,r);
xi = _mm256_add_epi8(xi,r);
_mm256_storeu_si256((__m256i *) z,xi);
x += 32;
y += 32;
z += 32;
len -= 32;
}
for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c);
}
static void vectormod3_shift(small *z,int len)
{
int i;
while (len >= 33) {
__m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 33));
_mm256_storeu_si256((__m256i *) (z + len - 32),zi);
len -= 32;
}
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
z[0] = 0;
}
/*
r = s^(-1) mod m, returning 0 if s is invertible mod m,
or returning -1 if s is not invertible mod m
r,s are polys of degree < p
m is x^p - x - 1
*/
int r3_recip(small *r,const small *s)
{
const int loops = 2*p + 1;
int loop;
small f[768];
small g[769];
small u[1536];
small v[1537];
small c;
int i;
int d = p;
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
/* f[i]==0 for i < p-d */
/* g has degree <=p (so it fits in p+1 coefficients) */
/* g[i]==0 for i < p-e */
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
c = mod3_quotient(g[p],f[p]);
vectormod3_minusproduct(g,768,g,f,c);
vectormod3_shift(g,769);
#ifdef SIMPLER
vectormod3_minusproduct(v,1536,v,u,c);
vectormod3_shift(v,1537);
#else
if (loop < p) {
vectormod3_minusproduct(v,loop + 1,v,u,c);
vectormod3_shift(v,loop + 2);
} else {
vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormod3_shift(v + loop - p,p + 2);
}
#endif
e -= 1;
++loop;
swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,(p + 1) * sizeof(small),swapmask);
#ifdef SIMPLER
swap(u,v,1536 * sizeof(small),swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(small),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask);
}
#endif
}
c = mod3_reciprocal(f[p]);
vectormod3_product(r,p,u + p,c);
for (i = p;i < 768;++i) r[i] = 0;
return smaller_mask(0,d);
}

@ -0,0 +1,17 @@
#include "params.h"
#include "randombytes.h"
#include "crypto_uint32.h"
#include "small.h"
void small_random(small *g)
{
crypto_uint32 r[p];
int i;
randombytes((unsigned char *) r,sizeof r);
for (i = 0;i < p;++i)
g[i] = (small) (((r[i] & 1073741823) * 3) >> 30) - 1;
/* bias is minuscule: each output lands in {-1,0,1} with probability within 2^-30 of 1/3 */
for (i = p;i < 768;++i)
g[i] = 0;
}

@ -0,0 +1,17 @@
#include "params.h"
#include "randombytes.h"
#include "int32_sort.h"
#include "small.h"
void small_random_weightw(small *f)
{
crypto_int32 r[p];
int i;
randombytes((unsigned char *) r,sizeof r);
for (i = 0;i < w;++i) r[i] &= -2;
for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1;
int32_sort(r,p);
for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1;
for (i = p;i < 768;++i) f[i] = 0;
}

@ -0,0 +1,158 @@
#include <immintrin.h>
#include "params.h"
#include "crypto_uint32.h"
#include "crypto_int64.h"
#include "rq.h"
#define v2295_16 _mm256_set1_epi16(2295)
#define v2295_16_128 _mm_set1_epi16(2295)
#define alpha_top _mm256_set1_epi32(0x43380000)
#define alpha _mm256_set1_pd(6755399441055744.0)
#define alpha_64 _mm256_set1_epi64(0x4338000000000000)
/* each reciprocal is rounded _up_ to nearest floating-point number */
#define recip54 0.0185185185185185209599811884118025773204863071441650390625
#define recip4591 0.000217817468961010681817447309782664888189174234867095947265625
#define recip6144 0.0001627604166666666847367028747584072334575466811656951904296875
#define recip331776 0.00000301408179012345704632478034235010255770248477347195148468017578125
#define recip37748736 0.000000026490953233506946282623583451172610825352649044361896812915802001953125
#define broadcast(r) _mm256_set1_pd(r)
#define floor(x) _mm256_floor_pd(x)
void rq_encode(unsigned char *c,const modq *f)
{
crypto_int32 f0, f1, f2, f3, f4;
int i;
for (i = 0;i < p/5;++i) {
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f2 = *f++ + qshift;
f3 = *f++ + qshift;
f4 = *f++ + qshift;
/* now want f0 + 6144*f1 + ... as a 64-bit integer */
f1 *= 3;
f2 *= 9;
f3 *= 27;
f4 *= 81;
/* now want f0 + f1<<11 + f2<<22 + f3<<33 + f4<<44 */
f0 += f1 << 11;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
f0 += f2 << 6;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
f0 += f3 << 1;
*c++ = f0; f0 >>= 8;
f0 += f4 << 4;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
/* XXX: using p mod 5 = 1 */
f0 = *f++ + qshift;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
void rq_decode(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1;
int i;
for (i = 0;i < 152;i += 4) {
__m256i abcd, ac, bd, abcd0, abcd1;
__m256d x0, x1, f4, f3, f2, f1, f0;
__m128i if4, if3, if2, if1, if0;
__m128i x01, x23, x02, x13, xab, xcd;
/* f0 + f1*6144 + f2*6144^2 + f3*6144^3 + f4*6144^4 */
/* = c0 + c1*256 + ... + c6*256^6 + c7*256^7 */
/* with each f between 0 and 4590 */
/* could use _mm256_cvtepi32_pd instead; but beware uint32 */
abcd = _mm256_loadu_si256((__m256i *) c); /* a0 a1 b0 b1 c0 c1 d0 d1 */
c += 32;
ac = _mm256_unpacklo_epi32(abcd,alpha_top); /* a0 a1 c0 c1 */
bd = _mm256_unpackhi_epi32(abcd,alpha_top); /* b0 b1 d0 d1 */
abcd1 = _mm256_unpackhi_epi64(ac,bd); /* a1 b1 c1 d1 */
abcd0 = _mm256_unpacklo_epi64(ac,bd); /* a0 b0 c0 d0 */
x1 = *(__m256d *) &abcd1;
x0 = *(__m256d *) &abcd0;
x1 -= alpha;
x0 -= alpha;
/* x1 is [0,41] + [0,4590]*54 + f4*331776 */
f4 = broadcast(recip331776) * x1;
f4 = floor(f4);
x1 -= broadcast(331776.0) * f4;
/* x1 is [0,41] + f3*54 */
f3 = broadcast(recip54) * x1;
f3 = floor(f3);
x1 -= broadcast(54.0) * f3;
x0 += broadcast(4294967296.0) * x1;
/* x0 is [0,4590] + [0,4590]*6144 + f2*6144^2 */
f2 = broadcast(recip37748736) * x0;
f2 = floor(f2);
x0 -= broadcast(37748736.0) * f2;
/* x0 is [0,4590] + f1*6144 */
f1 = broadcast(recip6144) * x0;
f1 = floor(f1);
x0 -= broadcast(6144.0) * f1;
f0 = x0;
f4 -= broadcast(4591.0) * floor(broadcast(recip4591) * f4);
f3 -= broadcast(4591.0) * floor(broadcast(recip4591) * f3);
f2 -= broadcast(4591.0) * floor(broadcast(recip4591) * f2);
f1 -= broadcast(4591.0) * floor(broadcast(recip4591) * f1);
f0 -= broadcast(4591.0) * floor(broadcast(recip4591) * f0);
if4 = _mm256_cvtpd_epi32(f4); /* a4 0 b4 0 c4 0 d4 0 */
if3 = _mm256_cvtpd_epi32(f3); /* a3 0 b3 0 c3 0 d3 0 */
if2 = _mm256_cvtpd_epi32(f2); /* a2 0 b2 0 c2 0 d2 0 */
if1 = _mm256_cvtpd_epi32(f1); /* a1 0 b1 0 c1 0 d1 0 */
if0 = _mm256_cvtpd_epi32(f0); /* a0 0 b0 0 c0 0 d0 0 */
if4 = _mm_sub_epi16(if4,v2295_16_128);
f[4] = _mm_extract_epi32(if4,0);
f[9] = _mm_extract_epi32(if4,1);
f[14] = _mm_extract_epi32(if4,2);
f[19] = _mm_extract_epi32(if4,3);
x23 = _mm_packs_epi32(if2,if3); /* a2 b2 c2 d2 a3 b3 c3 d3 */
x01 = _mm_packs_epi32(if0,if1); /* a0 b0 c0 d0 a1 b1 c1 d1 */
x02 = _mm_unpacklo_epi16(x01,x23); /* a0 a2 b0 b2 c0 c2 d0 d2 */
x13 = _mm_unpackhi_epi16(x01,x23); /* a1 a3 b1 b3 c1 c3 d1 d3 */
xab = _mm_unpacklo_epi16(x02,x13); /* a0 a1 a2 a3 b0 b1 b2 b3 */
xcd = _mm_unpackhi_epi16(x02,x13); /* c0 c1 c2 c3 d0 d1 d2 d3 */
xab = _mm_sub_epi16(xab,v2295_16_128);
xcd = _mm_sub_epi16(xcd,v2295_16_128);
*(crypto_int64 *) (f + 0) = _mm_extract_epi64(xab,0);
*(crypto_int64 *) (f + 5) = _mm_extract_epi64(xab,1);
*(crypto_int64 *) (f + 10) = _mm_extract_epi64(xcd,0);
*(crypto_int64 *) (f + 15) = _mm_extract_epi64(xcd,1);
f += 20;
}
c0 = *c++;
c1 = *c++;
c0 += c1 << 8;
*f++ = modq_freeze(c0 + q - qshift);
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
}
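rq_encode packs five coefficients in radix 6144 = 3*2^11, which is why it multiplies by powers of 3 and shifts by multiples of 11; rq_decode splits the same quantity back out with the precomputed reciprocals above. A minimal sketch of the radix itself, using plain 64-bit division in place of the floating-point machinery:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
  /* five values in [0,4590] fit in 8 bytes: 6144^5 < 2^63 */
  uint64_t f[5] = {4590, 0, 1234, 2295, 17};
  uint64_t word = 0;
  int i;
  for (i = 4;i >= 0;--i) word = word * 6144 + f[i];
  for (i = 0;i < 5;++i) {
    printf("%llu ",(unsigned long long) (word % 6144));
    word /= 6144;
  }
  printf("\n"); /* prints: 4590 0 1234 2295 17 */
  return 0;
}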

@ -0,0 +1,31 @@
#ifndef rq_h
#define rq_h
#include "modq.h"
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_avx_rq_encode
extern void rq_encode(unsigned char *,const modq *);
#define rq_decode crypto_kem_sntrup4591761_avx_rq_decode
extern void rq_decode(modq *,const unsigned char *);
#define rq_roundencode crypto_kem_sntrup4591761_avx_rq_roundencode
extern void rq_roundencode(unsigned char *,const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_avx_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_avx_rq_round3
extern void rq_round3(modq *,const modq *);
#define rq_mod3 crypto_kem_sntrup4591761_avx_rq_mod3
extern void rq_mod3(small *,const modq *);
#define rq_mult crypto_kem_sntrup4591761_avx_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
#define rq_recip3 crypto_kem_sntrup4591761_avx_rq_recip3
int rq_recip3(modq *,const small *);
#endif

@ -0,0 +1,49 @@
#include <immintrin.h>
#include "mod3.h"
#include "rq.h"
#define v3 _mm256_set1_epi16(3)
#define v7 _mm256_set1_epi16(7)
#define v2296_16 _mm256_set1_epi16(2296)
#define v4591_16 _mm256_set1_epi16(4591)
#define v10923_16 _mm256_set1_epi16(10923)
static inline __m256i squeeze(__m256i x)
{
__m256i q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
}
static inline __m256i freeze(__m256i x)
{
__m256i mask, x2296, x4591;
x4591 = _mm256_add_epi16(x,v4591_16);
mask = _mm256_srai_epi16(x,15);
x = _mm256_blendv_epi8(x,x4591,mask);
x2296 = _mm256_sub_epi16(x,v2296_16);
mask = _mm256_srai_epi16(x2296,15);
x4591 = _mm256_sub_epi16(x,v4591_16);
x = _mm256_blendv_epi8(x4591,x,mask);
return x;
}
void rq_mod3(small *g,const modq *f)
{
int i;
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
__m256i q;
x = _mm256_mullo_epi16(x,v3);
x = squeeze(x);
x = freeze(x);
q = _mm256_mulhrs_epi16(x,v10923_16);
x = _mm256_sub_epi16(x,q);
q = _mm256_add_epi16(q,q);
x = _mm256_sub_epi16(x,q); /* g0 g1 ... g15 */
x = _mm256_packs_epi16(x,x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
0[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,0),0);
1[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,1),0);
}
}

@ -0,0 +1,215 @@
#include <immintrin.h>
#include "params.h"
#include "swap.h"
#include "rq.h"
#define v7 _mm256_set1_epi16(7)
#define v1827_16 _mm256_set1_epi16(1827)
#define v4591_16 _mm256_set1_epi16(4591)
#define v29234_16 _mm256_set1_epi16(29234)
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
{
return (x - y) >> 31;
}
static inline __m256i product(__m256i x,__m256i y)
{
__m256i lo, hi, r0, r1, t0, t1, t, s0, s1;
lo = _mm256_mullo_epi16(x,y);
hi = _mm256_mulhi_epi16(x,y);
r0 = _mm256_unpacklo_epi16(lo,hi);
r1 = _mm256_unpackhi_epi16(lo,hi);
t0 = _mm256_srai_epi32(r0,16);
t1 = _mm256_srai_epi32(r1,16);
t = _mm256_packs_epi32(t0,t1);
t = _mm256_mulhrs_epi16(t,v29234_16);
lo = _mm256_mullo_epi16(t,v4591_16);
hi = _mm256_mulhi_epi16(t,v4591_16);
s0 = _mm256_unpacklo_epi16(lo,hi);
s1 = _mm256_unpackhi_epi16(lo,hi);
s0 = _mm256_slli_epi32(s0,4);
s1 = _mm256_slli_epi32(s1,4);
r0 = _mm256_sub_epi32(r0,s0);
r1 = _mm256_sub_epi32(r1,s1);
t0 = _mm256_srai_epi32(r0,8);
t1 = _mm256_srai_epi32(r1,8);
t = _mm256_packs_epi32(t0,t1);
t = _mm256_mulhrs_epi16(t,v1827_16);
lo = _mm256_mullo_epi16(t,v4591_16);
hi = _mm256_mulhi_epi16(t,v4591_16);
s0 = _mm256_unpacklo_epi16(lo,hi);
s1 = _mm256_unpackhi_epi16(lo,hi);
r0 = _mm256_sub_epi32(r0,s0);
r1 = _mm256_sub_epi32(r1,s1);
x = _mm256_packs_epi32(r0,r1);
return x;
}
static inline __m256i minusproduct(__m256i x,__m256i y,__m256i z)
{
__m256i t;
x = _mm256_sub_epi16(x,product(y,z));
t = _mm256_mulhrs_epi16(x,v7);
t = _mm256_mullo_epi16(t,v4591_16);
x = _mm256_sub_epi16(x,t);
return x;
}
static void vectormodq_product(modq *z,int len,const modq *x,const modq c)
{
__m256i cvec = _mm256_set1_epi16(c);
while (len >= 16) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
xi = product(xi,cvec);
_mm256_storeu_si256((__m256i *) z,xi);
x += 16;
z += 16;
len -= 16;
}
while (len > 0) {
*z = modq_product(*x,c);
++x;
++z;
--len;
}
}
static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c)
{
__m256i cvec = _mm256_set1_epi16(c);
while (len >= 16) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
__m256i yi = _mm256_loadu_si256((__m256i *) y);
xi = minusproduct(xi,yi,cvec);
_mm256_storeu_si256((__m256i *) z,xi);
x += 16;
y += 16;
z += 16;
len -= 16;
}
while (len > 0) {
*z = modq_minusproduct(*x,*y,c);
++x;
++y;
++z;
--len;
}
}
static void vectormodq_shift(modq *z,int len)
{
int i;
while (len >= 17) {
__m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 17));
_mm256_storeu_si256((__m256i *) (z + len - 16),zi);
len -= 16;
}
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
z[0] = 0;
}
/*
r = (3s)^(-1) mod m, returning 0 if s is invertible mod m,
or returning -1 if s is not invertible mod m
r,s are polys of degree < p
m is x^p - x - 1
*/
int rq_recip3(modq *r,const small *s)
{
const int loops = 2*p + 1;
int loop;
modq f[768];
modq g[769];
modq u[1536];
modq v[1537];
modq c;
int i;
int d = p;
int e = p;
int swapmask;
for (i = 2;i < 768;++i) f[i] = 0; /* zero the vector tail too; ops below read all 768 lanes */
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = 3 * s[i];
for (i = p;i < 769;++i) g[i] = 0;
for (i = 0;i < 1536;++i) u[i] = 0;
v[0] = 1;
for (i = 1;i < 1537;++i) v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
/* f[i]==0 for i < p-d */
/* g has degree <=p (so it fits in p+1 coefficients) */
/* g[i]==0 for i < p-e */
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
c = modq_quotient(g[p],f[p]);
vectormodq_minusproduct(g,768,g,f,c);
vectormodq_shift(g,769);
#ifdef SIMPLER
vectormodq_minusproduct(v,1536,v,u,c);
vectormodq_shift(v,1537);
#else
if (loop < p) {
vectormodq_minusproduct(v,loop + 1,v,u,c);
vectormodq_shift(v,loop + 2);
} else {
vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormodq_shift(v + loop - p,p + 2);
}
#endif
e -= 1;
++loop;
swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,768 * sizeof(modq),swapmask);
#ifdef SIMPLER
swap(u,v,1536 * sizeof(modq),swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(modq),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask);
}
#endif
}
c = modq_reciprocal(f[p]);
vectormodq_product(r,p,u + p,c);
for (i = 0;i < p;++i) r[i] = modq_freeze(r[i]);
for (i = p;i < 768;++i) r[i] = 0;
return smaller_mask(0,d);
}

@ -0,0 +1,20 @@
#include <immintrin.h>
#include "params.h"
#include "rq.h"
#define v3_16 _mm256_set1_epi16(3)
#define v10923_16 _mm256_set1_epi16(10923)
void rq_round3(modq *h,const modq *f)
{
int i;
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
__m256i x2;
x = _mm256_mulhrs_epi16(x,v10923_16);
x2 = _mm256_add_epi16(x,x);
x = _mm256_add_epi16(x,x2);
_mm256_storeu_si256((__m256i *) &h[i],x);
}
}
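/* Exposition (a sketch, not part of the diff): each lane computes
   3*round(f/3), i.e. rounds the coefficient to the nearest multiple of 3;
   the mulhrs step is the rounded division, and x + (x + x) rebuilds 3x. */
static inline modq round3_lane_model(modq f) /* f assumed already frozen */
{
  modq t = (modq) (((crypto_int32) f * 10923 + 16384) >> 15); /* round(f/3) */
  return (modq) (3 * t);
}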

@ -0,0 +1,260 @@
#include <immintrin.h>
#include "params.h"
#include "crypto_uint32.h"
#include "rq.h"
#define alpha_top _mm256_set1_epi32(0x43380000)
#define alpha _mm256_set1_pd(6755399441055744.0)
#define v10923_16 _mm256_set1_epi16(10923)
#define floor(x) _mm256_floor_pd(x)
void rq_roundencode(unsigned char *c,const modq *f)
{
int i;
__m256i h[50];
for (i = 0;i < 208;i += 16) {
__m256i a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2;
__m256i e0, e1, f0, f1, g0, g1;
a0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[0]));
a1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[8]));
a2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[16]));
a0 = _mm256_inserti128_si256(a0,_mm_loadu_si128((__m128i *) &f[24]),1);
a1 = _mm256_inserti128_si256(a1,_mm_loadu_si128((__m128i *) &f[32]),1);
a2 = _mm256_inserti128_si256(a2,_mm_loadu_si128((__m128i *) &f[40]),1);
f += 48;
a0 = _mm256_mulhrs_epi16(a0,v10923_16);
a1 = _mm256_mulhrs_epi16(a1,v10923_16);
a2 = _mm256_mulhrs_epi16(a2,v10923_16);
/* a0: a0 a1 a2 b0 b1 b2 c0 c1 and similar second half */
/* a1: c2 d0 d1 d2 e0 e1 e2 f0 */
/* a2: f1 f2 g0 g1 g2 h0 h1 h2 */
b1 = _mm256_blend_epi16(a2,a0,0xf0);
b1 = _mm256_shuffle_epi32(b1,0x4e);
b0 = _mm256_blend_epi16(a0,a1,0xf0);
b2 = _mm256_blend_epi16(a1,a2,0xf0);
/* XXX: use shufps instead? */
/* b0: a0 a1 a2 b0 e0 e1 e2 f0 */
/* b1: b1 b2 c0 c1 f1 f2 g0 g1 */
/* b2: c2 d0 d1 d2 g2 h0 h1 h2 */
c1 = _mm256_blend_epi16(b2,b0,0xcc);
c1 = _mm256_shuffle_epi32(c1,0xb1);
c0 = _mm256_blend_epi16(b0,b1,0xcc);
c2 = _mm256_blend_epi16(b1,b2,0xcc);
/* c0: a0 a1 c0 c1 e0 e1 g0 g1 */
/* c1: a2 b0 c2 d0 e2 f0 g2 h0 */
/* c2: b1 b2 d1 d2 f1 f2 h1 h2 */
d1 = _mm256_blend_epi16(c2,c0,0xaa);
d1 = _mm256_shufflelo_epi16(d1,0xb1);
d1 = _mm256_shufflehi_epi16(d1,0xb1);
d0 = _mm256_blend_epi16(c0,c1,0xaa);
d2 = _mm256_blend_epi16(c1,c2,0xaa);
/* d0: a0 b0 c0 d0 e0 f0 g0 h0 */
/* d1: a1 b1 c1 d1 e1 f1 g1 h1 */
/* d2: a2 b2 c2 d2 e2 f2 g2 h2 */
d0 = _mm256_add_epi16(d0,_mm256_set1_epi16(765));
d1 = _mm256_add_epi16(d1,_mm256_set1_epi16(765));
d2 = _mm256_add_epi16(d2,_mm256_set1_epi16(765));
/* want bytes of d0 + 1536*d1 + 1536*1536*d2 */
e0 = d0 & _mm256_set1_epi16(0xff);
d0 = _mm256_srli_epi16(d0,8);
/* want e0, d0 + 6*d1 + 6*1536*d2 */
d1 = _mm256_mullo_epi16(d1,_mm256_set1_epi16(6));
d0 = _mm256_add_epi16(d0,d1);
/* want e0, d0 + 6*1536*d2 */
e1 = _mm256_slli_epi16(d0,8);
e0 = _mm256_add_epi16(e0,e1);
d0 = _mm256_srli_epi16(d0,8);
/* want e0, d0 + 36*d2 */
d2 = _mm256_mullo_epi16(d2,_mm256_set1_epi16(36));
e1 = _mm256_add_epi16(d0,d2);
/* want e0, e1 */
/* e0: out0 out1 out4 out5 out8 out9 ... */
/* e1: out2 out3 out6 out7 out10 out11 ... */
f0 = _mm256_unpacklo_epi16(e0,e1);
f1 = _mm256_unpackhi_epi16(e0,e1);
g0 = _mm256_permute2x128_si256(f0,f1,0x20);
g1 = _mm256_permute2x128_si256(f0,f1,0x31);
_mm256_storeu_si256((__m256i *) c,g0);
_mm256_storeu_si256((__m256i *) (c + 32),g1);
c += 64;
}
for (i = 0;i < 9;++i) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[16 * i]);
_mm256_storeu_si256(&h[i],_mm256_mulhrs_epi16(x,v10923_16));
}
f = (const modq *) h;
for (i = 208;i < 253;++i) {
crypto_int32 f0, f1, f2;
f0 = *f++;
f1 = *f++;
f2 = *f++;
f0 += 1806037245;
f1 *= 3;
f2 *= 9;
f0 += f1 << 9;
f0 += f2 << 18;
*(crypto_int32 *) c = f0;
c += 4;
}
{
crypto_int32 f0, f1;
f0 = *f++;
f1 = *f++;
f0 += 1175805;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
}
void rq_decoderounded(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1, c2, c3;
crypto_uint32 f0, f1, f2;
int i;
for (i = 0;i < 248;i += 8) {
__m256i abcdefgh, todo[2];
__m256d x, f2, f1, f0;
__m128i if2, if1, if0;
int j;
abcdefgh = _mm256_loadu_si256((__m256i *) c);
c += 32;
todo[0] = _mm256_unpacklo_epi32(abcdefgh,alpha_top);
todo[1] = _mm256_unpackhi_epi32(abcdefgh,alpha_top);
for (j = 0;j < 2;++j) {
x = *(__m256d *) &todo[j];
x -= alpha;
/* x is f0 + f1*1536 + f2*1536^2 */
/* with each f between 0 and 1530 */
f2 = x * _mm256_set1_pd(0.00000042385525173611114052197733521876177320564238470979034900665283203125);
f2 = floor(f2);
x -= f2 * _mm256_set1_pd(2359296.0);
f1 = x * _mm256_set1_pd(0.00065104166666666673894681149903362893383018672466278076171875);
f1 = floor(f1);
x -= f1 * _mm256_set1_pd(1536.0);
f0 = x;
f2 -= _mm256_set1_pd(1531.0) * floor(f2 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f1 -= _mm256_set1_pd(1531.0) * floor(f1 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f0 -= _mm256_set1_pd(1531.0) * floor(f0 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875));
f2 *= _mm256_set1_pd(3.0); f2 -= _mm256_set1_pd(2295.0);
f1 *= _mm256_set1_pd(3.0); f1 -= _mm256_set1_pd(2295.0);
f0 *= _mm256_set1_pd(3.0); f0 -= _mm256_set1_pd(2295.0);
if2 = _mm256_cvtpd_epi32(f2); /* a2 b2 e2 f2 */
if1 = _mm256_cvtpd_epi32(f1); /* a1 b1 e1 f1 */
if0 = _mm256_cvtpd_epi32(f0); /* a0 b0 e0 f0 */
f[6*j + 0] = _mm_extract_epi32(if0,0);
f[6*j + 1] = _mm_extract_epi32(if1,0);
f[6*j + 2] = _mm_extract_epi32(if2,0);
f[6*j + 3] = _mm_extract_epi32(if0,1);
f[6*j + 4] = _mm_extract_epi32(if1,1);
f[6*j + 5] = _mm_extract_epi32(if2,1);
f[6*j + 12] = _mm_extract_epi32(if0,2);
f[6*j + 13] = _mm_extract_epi32(if1,2);
f[6*j + 14] = _mm_extract_epi32(if2,2);
f[6*j + 15] = _mm_extract_epi32(if0,3);
f[6*j + 16] = _mm_extract_epi32(if1,3);
f[6*j + 17] = _mm_extract_epi32(if2,3);
}
f += 24;
}
for (i = 248;i < 253;++i) {
c0 = *c++;
c1 = *c++;
c2 = *c++;
c3 = *c++;
/* f0 + f1*1536 + f2*1536^2 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 */
/* with each f between 0 and 1530 */
/* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */
/* claim: 2^21 f2 < x < 2^21(f2+1) */
/* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */
/* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */
/* at least 456 - (8/9)255 - (2/9)255 > 0 */
/* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */
f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21;
c2 += c3 << 8;
c2 -= (f2 * 9) << 2;
/* f0 + f1*1536 */
/* = c0 + c1*256 + c2*256^2 */
/* c2 <= 35 = floor((1530+1530*1536)/256^2) */
/* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */
/* claim: 2^21 f1 < x < 2^21(f1+1) */
/* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */
/* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */
/* at least 1365 - (1/3)35 - (1/3)255 - (1/3)255 > 0 */
/* at most 1365 + (4096/3)1530 < 2^21 */
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = modq_freeze(f2 * 3 + q - qshift);
}
c0 = *c++;
c1 = *c++;
c2 = *c++;
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
}
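/* Exposition (a sketch, not part of the diff): alpha_top/alpha implement the
   classic magic-number conversion. 0x43380000 is the high word of the double
   2^52 + 2^51, so unpacking a 32-bit word c underneath it yields the double
   2^52 + 2^51 + c exactly, and subtracting alpha recovers c. Scalar model,
   assuming little-endian IEEE-754 doubles: */
#include <stdint.h>
#include <string.h>
static double u32_to_double_via_alpha(uint32_t c)
{
  uint64_t bits = ((uint64_t) 0x43380000 << 32) | c;
  double d;
  memcpy(&d,&bits,sizeof d);
  return d - 6755399441055744.0; /* alpha = 2^52 + 2^51; result is exactly c */
}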

@ -0,0 +1,45 @@
#include <immintrin.h>
#include "params.h"
#include "small.h"
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void small_encode(unsigned char *c,const small *f)
{
small c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *f++ + 1;
c0 += (*f++ + 1) << 2;
c0 += (*f++ + 1) << 4;
c0 += (*f++ + 1) << 6;
*c++ = c0;
}
c0 = *f++ + 1;
*c++ = c0;
}
void small_decode(small *f,const unsigned char *c)
{
unsigned char c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1;
}
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
*f++ = 0;
}

@ -0,0 +1,20 @@
#ifndef small_h
#define small_h
#include "crypto_int8.h"
typedef crypto_int8 small;
#define small_encode crypto_kem_sntrup4591761_avx_small_encode
extern void small_encode(unsigned char *,const small *);
#define small_decode crypto_kem_sntrup4591761_avx_small_decode
extern void small_decode(small *,const unsigned char *);
#define small_random crypto_kem_sntrup4591761_avx_small_random
extern void small_random(small *);
#define small_random_weightw crypto_kem_sntrup4591761_avx_small_random_weightw
extern void small_random_weightw(small *);
#endif

@ -0,0 +1,32 @@
#include <immintrin.h>
#include "swap.h"
void swap(void *x,void *y,int bytes,int mask)
{
char c = mask;
__m256i maskvec = _mm256_set1_epi32(mask);
while (bytes >= 32) {
__m256i xi = _mm256_loadu_si256(x);
__m256i yi = _mm256_loadu_si256(y);
__m256i xinew = _mm256_blendv_epi8(xi,yi,maskvec);
__m256i yinew = _mm256_blendv_epi8(yi,xi,maskvec);
_mm256_storeu_si256(x,xinew);
_mm256_storeu_si256(y,yinew);
x = 32 + (char *) x;
y = 32 + (char *) y;
bytes -= 32;
}
while (bytes > 0) {
char xi = *(char *) x;
char yi = *(char *) y;
char t = c & (xi ^ yi);
xi ^= t;
yi ^= t;
*(char *) x = xi;
*(char *) y = yi;
x = 1 + (char *) x;
y = 1 + (char *) y;
--bytes;
}
}

@ -0,0 +1,7 @@
#ifndef swap_h
#define swap_h
#define swap crypto_kem_sntrup4591761_avx_swap
extern void swap(void *,void *,int,int);
#endif

@ -0,0 +1,28 @@
#include <immintrin.h>
#include "params.h"
#include "r3.h"
#include "crypto_uint16.h"
#include "crypto_int32.h"
int r3_weightw_mask(const small *r)
{
int weight;
int i;
__m256i tally = _mm256_set1_epi32(0);
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i *) &r[i]));
x &= _mm256_set1_epi32(0x00010001);
tally = _mm256_add_epi16(tally,x);
}
tally = _mm256_hadd_epi16(tally,tally);
tally = _mm256_hadd_epi16(tally,tally);
tally = _mm256_hadd_epi16(tally,tally);
weight = _mm_extract_epi16(_mm256_extracti128_si256(tally,0),0) + _mm_extract_epi16(_mm256_extracti128_si256(tally,1),0);
weight -= w;
return (-(crypto_int32) (crypto_uint16) weight) >> 30;
}

@ -0,0 +1 @@
83705d49d3a8cb2e16028b86ea6bd44a969b51c2e5114ee02767cf2ddf1aac26

@ -0,0 +1 @@
336647fe0ed2f6e0d4b15d05e68faec67a81312d769ad3cbee8e0f2de83c2dde

@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

@ -0,0 +1,32 @@
This is a reference implementation of Streamlined NTRU Prime 4591^761.
This implementation is designed primarily for clarity, subject to the
following constraints:
* The implementation is written in C. The Sage implementation in the
NTRU Prime paper is considerably more concise (and compatible).
* The implementation avoids data-dependent branches and array
indices. For example, conditional swaps are computed by arithmetic
rather than by branches.
* The implementation avoids other C operations that often take
variable time. For example, divisions by 3 are computed via
multiplications and shifts.
This implementation does _not_ sacrifice clarity for speed.
This implementation has not yet been reviewed for correctness or for
constant-time behavior. It does pass various tests and has no known
bugs, but there are at least some platforms where multiplications take
variable time, and fixing this requires platform-specific effort; see
https://www.bearssl.org/ctmul.html and http://repository.tue.nl/800603.
This implementation allows "benign malleability" of ciphertexts, as
defined in http://www.shoup.net/papers/iso-2_1.pdf. Specifically, each
32-bit ciphertext word encodes three integers between 0 and 1530; if
larger integers appear then they are silently reduced modulo 1531.
Similar comments apply to public keys.
There is a separate "avx" implementation where similar comments apply,
except that "avx" _does_ sacrifice clarity for speed on CPUs with AVX2
instructions.

@ -0,0 +1,4 @@
#define CRYPTO_SECRETKEYBYTES 1600
#define CRYPTO_PUBLICKEYBYTES 1218
#define CRYPTO_CIPHERTEXTBYTES 1047
#define CRYPTO_BYTES 32

@ -0,0 +1,71 @@
#ifdef KAT
#include <stdio.h>
#endif
#include "params.h"
#include "small.h"
#include "mod3.h"
#include "rq.h"
#include "r3.h"
#include "crypto_hash_sha512.h"
#include "crypto_verify_32.h"
#include "crypto_kem.h"
int crypto_kem_dec(
unsigned char *k,
const unsigned char *cstr,
const unsigned char *sk
)
{
small f[p];
modq h[p];
small grecip[p];
modq c[p];
modq t[p];
small t3[p];
small r[p];
modq hr[p];
unsigned char rstr[small_encode_len];
unsigned char hash[64];
int i;
int result = 0;
int weight;
small_decode(f,sk);
small_decode(grecip,sk + small_encode_len);
rq_decode(h,sk + 2 * small_encode_len);
rq_decoderounded(c,cstr + 32);
rq_mult(t,c,f);
for (i = 0;i < p;++i) t3[i] = mod3_freeze(modq_freeze(3*t[i]));
r3_mult(r,t3,grecip);
#ifdef KAT
{
int j;
printf("decrypt r:");
for (j = 0;j < p;++j)
if (r[j] == 1) printf(" +%d",j);
else if (r[j] == -1) printf(" -%d",j);
printf("\n");
}
#endif
weight = 0;
for (i = 0;i < p;++i) weight += (1 & r[i]);
weight -= w;
result |= modq_nonzero_mask(weight); /* XXX: puts limit on p */
rq_mult(hr,h,r);
rq_round3(hr,hr);
for (i = 0;i < p;++i) result |= modq_nonzero_mask(hr[i] - c[i]);
small_encode(rstr,r);
crypto_hash_sha512(hash,rstr,sizeof rstr);
result |= crypto_verify_32(hash,cstr);
for (i = 0;i < 32;++i) k[i] = (hash[32 + i] & ~result);
return result;
}

@ -0,0 +1,49 @@
#ifdef KAT
#include <stdio.h>
#endif
#include <string.h>
#include "params.h"
#include "small.h"
#include "rq.h"
#include "crypto_hash_sha512.h"
#include "crypto_kem.h"
int crypto_kem_enc(
unsigned char *cstr,
unsigned char *k,
const unsigned char *pk
)
{
small r[p];
modq h[p];
modq c[p];
unsigned char rstr[small_encode_len];
unsigned char hash[64];
small_random_weightw(r);
#ifdef KAT
{
int i;
printf("encrypt r:");
for (i = 0;i < p;++i)
if (r[i] == 1) printf(" +%d",i);
else if (r[i] == -1) printf(" -%d",i);
printf("\n");
}
#endif
small_encode(rstr,r);
crypto_hash_sha512(hash,rstr,sizeof rstr);
rq_decode(h,pk);
rq_mult(c,h,r);
rq_round3(c,c);
memcpy(k,hash + 32,32);
memcpy(cstr,hash,32);
rq_encoderounded(cstr + 32,c);
return 0;
}

@ -0,0 +1,5 @@
Alphabetical order:
Daniel J. Bernstein
Chitchanok Chuengsatiansup
Tanja Lange
Christine van Vredendaal

@ -0,0 +1,35 @@
#include "int32_sort.h"
#include "crypto_uint32.h"
static void minmax(crypto_int32 *x,crypto_int32 *y)
{
crypto_uint32 xi = *x;
crypto_uint32 yi = *y;
crypto_uint32 xy = xi ^ yi;
crypto_uint32 c = yi - xi;
c ^= xy & (c ^ yi);
c >>= 31;
c = -c;
c &= xy;
*x = xi ^ c;
*y = yi ^ c;
}
void int32_sort(crypto_int32 *x,int n)
{
int top,p,q,i;
if (n < 2) return;
top = 1;
while (top < n - top) top += top;
for (p = top;p > 0;p >>= 1) {
for (i = 0;i < n - p;++i)
if (!(i & p))
minmax(x + i,x + i + p);
for (q = top;q > p;q >>= 1)
for (i = 0;i < n - q;++i)
if (!(i & p))
minmax(x + i + p,x + i + q);
}
}
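/* Exposition (a sketch, not part of the diff): this is a branchless sorting
   network: minmax is a constant-time compare-and-swap, and the branch pattern
   depends only on n, never on the data. Usage sketch: */
static void int32_sort_demo(void)
{
  crypto_int32 a[5] = {3,-1,4,1,5};
  int32_sort(a,5); /* a becomes {-1,1,3,4,5}; running time depends only on n */
}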

@ -0,0 +1,9 @@
#ifndef int32_sort_h
#define int32_sort_h
#include "crypto_int32.h"
#define int32_sort crypto_kem_sntrup4591761_ref_int32_sort
extern void int32_sort(crypto_int32 *,int);
#endif

@ -0,0 +1,39 @@
#include <string.h>
#include "modq.h"
#include "params.h"
#include "r3.h"
#include "small.h"
#include "rq.h"
#include "crypto_kem.h"
#if crypto_kem_PUBLICKEYBYTES != rq_encode_len
#error "crypto_kem_PUBLICKEYBYTES must match rq_encode_len"
#endif
#if crypto_kem_SECRETKEYBYTES != rq_encode_len + 2 * small_encode_len
#error "crypto_kem_SECRETKEYBYTES must match rq_encode_len + 2 * small_encode_len"
#endif
int crypto_kem_keypair(unsigned char *pk,unsigned char *sk)
{
small g[p];
small grecip[p];
small f[p];
modq f3recip[p];
modq h[p];
do
small_random(g);
while (r3_recip(grecip,g) != 0);
small_random_weightw(f);
rq_recip3(f3recip,f);
rq_mult(h,f3recip,g);
rq_encode(pk,h);
small_encode(sk,f);
small_encode(sk + small_encode_len,grecip);
memcpy(sk + 2 * small_encode_len,pk,rq_encode_len);
return 0;
}

@ -0,0 +1,60 @@
#ifndef mod3_h
#define mod3_h
#include "small.h"
#include "crypto_int32.h"
/* -1 if x is nonzero, 0 otherwise */
static inline int mod3_nonzero_mask(small x)
{
return -x*x;
}
/* input between -100000 and 100000 */
/* output between -1 and 1 */
static inline small mod3_freeze(crypto_int32 a)
{
a -= 3 * ((10923 * a) >> 15);
a -= 3 * ((89478485 * a + 134217728) >> 28);
return a;
}
static inline small mod3_minusproduct(small a,small b,small c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return mod3_freeze(A - B * C);
}
static inline small mod3_plusproduct(small a,small b,small c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return mod3_freeze(A + B * C);
}
static inline small mod3_product(small a,small b)
{
return a * b;
}
static inline small mod3_sum(small a,small b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return mod3_freeze(A + B);
}
static inline small mod3_reciprocal(small a1)
{
return a1;
}
static inline small mod3_quotient(small num,small den)
{
return mod3_product(num,mod3_reciprocal(den));
}
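/* Spot checks for mod3_freeze (a sketch, not part of the diff): both
   10923/2^15 and 89478485/2^28 approximate 1/3, the second with enough
   precision to finish the reduction over the documented input range. */
#include <assert.h>
static void mod3_freeze_spotcheck(void)
{
  assert(mod3_freeze(7) == 1);   /* 7 = 3*2 + 1 */
  assert(mod3_freeze(6) == 0);
  assert(mod3_freeze(-7) == -1); /* -7 = 3*(-2) - 1 */
}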
#endif

@ -0,0 +1,92 @@
#ifndef modq_h
#define modq_h
#include "crypto_int16.h"
#include "crypto_int32.h"
#include "crypto_uint16.h"
#include "crypto_uint32.h"
typedef crypto_int16 modq;
/* -1 if x is nonzero, 0 otherwise */
static inline int modq_nonzero_mask(modq x)
{
crypto_int32 r = (crypto_uint16) x;
r = -r;
r >>= 30;
return r;
}
/* input between -9000000 and 9000000 */
/* output between -2295 and 2295 */
static inline modq modq_freeze(crypto_int32 a)
{
a -= 4591 * ((228 * a) >> 20);
a -= 4591 * ((58470 * a + 134217728) >> 28);
return a;
}
static inline modq modq_minusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A - B * C);
}
static inline modq modq_plusproduct(modq a,modq b,modq c)
{
crypto_int32 A = a;
crypto_int32 B = b;
crypto_int32 C = c;
return modq_freeze(A + B * C);
}
static inline modq modq_product(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A * B);
}
static inline modq modq_square(modq a)
{
crypto_int32 A = a;
return modq_freeze(A * A);
}
static inline modq modq_sum(modq a,modq b)
{
crypto_int32 A = a;
crypto_int32 B = b;
return modq_freeze(A + B);
}
static inline modq modq_reciprocal(modq a1)
{
modq a2 = modq_square(a1);
modq a3 = modq_product(a2,a1);
modq a4 = modq_square(a2);
modq a8 = modq_square(a4);
modq a16 = modq_square(a8);
modq a32 = modq_square(a16);
modq a35 = modq_product(a32,a3);
modq a70 = modq_square(a35);
modq a140 = modq_square(a70);
modq a143 = modq_product(a140,a3);
modq a286 = modq_square(a143);
modq a572 = modq_square(a286);
modq a1144 = modq_square(a572);
modq a1147 = modq_product(a1144,a3);
modq a2294 = modq_square(a1147);
modq a4588 = modq_square(a2294);
modq a4589 = modq_product(a4588,a1);
return a4589;
}
static inline modq modq_quotient(modq num,modq den)
{
return modq_product(num,modq_reciprocal(den));
}
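/* Exposition (a sketch, not part of the diff): modq_reciprocal is Fermat
   inversion. q = 4591 is prime, so a^(q-2) = a^4589 is the inverse of a
   mod q; the squarings and multiplications above follow the addition chain
   1, 2, 3, 4, 8, 16, 32, 35, 70, 140, 143, 286, 572, 1144, 1147, 2294,
   4588, 4589. Spot check: */
#include <assert.h>
static void modq_reciprocal_spotcheck(void)
{
  assert(modq_product(modq_reciprocal(3),3) == 1);
}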
#endif

@ -0,0 +1,14 @@
#ifndef params_h
#define params_h
#define q 4591
/* XXX: also built into modq in various ways */
#define qshift 2295
#define p 761
#define w 286
#define rq_encode_len 1218
#define small_encode_len 191
#endif

@ -0,0 +1,12 @@
#ifndef r3_h
#define r3_h
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_ref_r3_mult
extern void r3_mult(small *,const small *,const small *);
#define r3_recip crypto_kem_sntrup4591761_ref_r3_recip
extern int r3_recip(small *,const small *);
#endif

@ -0,0 +1,31 @@
#include "params.h"
#include "mod3.h"
#include "r3.h"
void r3_mult(small *h,const small *f,const small *g)
{
small fg[p + p - 1];
small result;
int i, j;
for (i = 0;i < p;++i) {
result = 0;
for (j = 0;j <= i;++j)
result = mod3_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p;i < p + p - 1;++i) {
result = 0;
for (j = i - p + 1;j < p;++j)
result = mod3_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = mod3_sum(fg[i - p],fg[i]);
fg[i - p + 1] = mod3_sum(fg[i - p + 1],fg[i]);
}
for (i = 0;i < p;++i)
h[i] = fg[i];
}

@ -0,0 +1,126 @@
#include "params.h"
#include "mod3.h"
#include "swap.h"
#include "r3.h"
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
{
return (x - y) >> 31;
}
static void vectormod3_product(small *z,int len,const small *x,const small c)
{
int i;
for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c);
}
static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c)
{
int i;
for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c);
}
static void vectormod3_shift(small *z,int len)
{
int i;
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
z[0] = 0;
}
/*
r = s^(-1) mod m, returning 0, if s is invertible mod m
or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int r3_recip(small *r,const small *s)
{
const int loops = 2*p + 1;
int loop;
small f[p + 1];
small g[p + 1];
small u[loops + 1];
small v[loops + 1];
small c;
int i;
int d = p;
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
/* f[i]==0 for i < p-d */
/* g has degree <=p (so it fits in p+1 coefficients) */
/* g[i]==0 for i < p-e */
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
c = mod3_quotient(g[p],f[p]);
vectormod3_minusproduct(g,p + 1,g,f,c);
vectormod3_shift(g,p + 1);
#ifdef SIMPLER
vectormod3_minusproduct(v,loops + 1,v,u,c);
vectormod3_shift(v,loops + 1);
#else
if (loop < p) {
vectormod3_minusproduct(v,loop + 1,v,u,c);
vectormod3_shift(v,loop + 2);
} else {
vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormod3_shift(v + loop - p,p + 2);
}
#endif
e -= 1;
++loop;
swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,(p + 1) * sizeof(small),swapmask);
#ifdef SIMPLER
swap(u,v,(loops + 1) * sizeof(small),swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(small),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask);
}
#endif
}
c = mod3_reciprocal(f[p]);
vectormod3_product(r,p,u + p,c);
return smaller_mask(0,d);
}

@ -0,0 +1,24 @@
#include "randombytes.h"
#include "small.h"
#ifdef KAT
/* NIST KAT generator fails to provide chunk-independence */
static unsigned char x[4*761];
static long long pos = 4*761;
#endif
crypto_int32 small_random32(void)
{
#ifdef KAT
if (pos == 4*761) {
randombytes(x,sizeof x);
pos = 0;
}
pos += 4;
return x[pos - 4] + (x[pos - 3] << 8) + (x[pos - 2] << 16) + (x[pos - 1] << 24);
#else
unsigned char x[4];
randombytes(x,4);
return x[0] + (x[1] << 8) + (x[2] << 16) + (x[3] << 24);
#endif
}

@ -0,0 +1,14 @@
#include "params.h"
#include "randombytes.h"
#include "crypto_uint32.h"
#include "small.h"
void small_random(small *g)
{
int i;
for (i = 0;i < p;++i) {
crypto_uint32 r = small_random32();
g[i] = (small) (((1073741823 & r) * 3) >> 30) - 1;
}
}
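/* Exposition (a sketch, not part of the diff): for u uniform in [0,2^30),
   (3*u) >> 30 takes each value in {0,1,2} for 2^30/3 (+-1) inputs, so each
   coefficient is within 2^-30 of uniform on {-1,0,1}. */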

@ -0,0 +1,16 @@
#include "params.h"
#include "randombytes.h"
#include "int32_sort.h"
#include "small.h"
void small_random_weightw(small *f)
{
crypto_int32 r[p];
int i;
for (i = 0;i < p;++i) r[i] = small_random32();
for (i = 0;i < w;++i) r[i] &= -2;
for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1;
int32_sort(r,p);
for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1;
}
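/* Exposition (a sketch, not part of the diff): the ternary value lives in the
   low two bits and the random high bits act as sort keys. Indices below w
   keep low bits 00 or 10 (decoding to -1 or +1); the rest are forced to 01
   (decoding to 0). The constant-time sort then scatters the w nonzero
   coefficients to (nearly) uniformly random positions. Sanity sketch: */
static void small_random_weightw_demo(void)
{
  small f[p];
  int i, weight = 0;
  small_random_weightw(f);
  for (i = 0;i < p;++i) weight += (f[i] != 0);
  /* weight == w always; each nonzero coefficient is -1 or +1 */
}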

@ -0,0 +1,128 @@
#include "params.h"
#include "crypto_uint32.h"
#include "rq.h"
void rq_encode(unsigned char *c,const modq *f)
{
crypto_int32 f0, f1, f2, f3, f4;
int i;
for (i = 0;i < p/5;++i) {
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f2 = *f++ + qshift;
f3 = *f++ + qshift;
f4 = *f++ + qshift;
/* now want f0 + 6144*f1 + ... as a 64-bit integer */
f1 *= 3;
f2 *= 9;
f3 *= 27;
f4 *= 81;
/* now want f0 + f1<<11 + f2<<22 + f3<<33 + f4<<44 */
f0 += f1 << 11;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
f0 += f2 << 6;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
f0 += f3 << 1;
*c++ = f0; f0 >>= 8;
f0 += f4 << 4;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
/* XXX: using p mod 5 = 1 */
f0 = *f++ + qshift;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
void rq_decode(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1, c2, c3, c4, c5, c6, c7;
crypto_uint32 f0, f1, f2, f3, f4;
int i;
for (i = 0;i < p/5;++i) {
c0 = *c++;
c1 = *c++;
c2 = *c++;
c3 = *c++;
c4 = *c++;
c5 = *c++;
c6 = *c++;
c7 = *c++;
/* f0 + f1*6144 + f2*6144^2 + f3*6144^3 + f4*6144^4 */
/* = c0 + c1*256 + ... + c6*256^6 + c7*256^7 */
/* with each f between 0 and 4590 */
c6 += c7 << 8;
/* c6 <= 23241 = floor(4591*6144^4/2^48) */
/* f4 = (16/81)c6 + (1/1296)(c5+[0,1]) - [0,0.75] */
/* claim: 2^19 f4 < x < 2^19(f4+1) */
/* where x = 103564 c6 + 405(c5+1) */
/* proof: x - 2^19 f4 = (76/81)c6 + (37/81)c5 + 405 - (32768/81)[0,1] + 2^19[0,0.75] */
/* at least 405 - 32768/81 > 0 */
/* at most (76/81)23241 + (37/81)255 + 405 + 2^19 0.75 < 2^19 */
f4 = (103564*c6 + 405*(c5+1)) >> 19;
c5 += c6 << 8;
c5 -= (f4 * 81) << 4;
c4 += c5 << 8;
/* f0 + f1*6144 + f2*6144^2 + f3*6144^3 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 + c4*256^4 */
/* c4 <= 247914 = floor(4591*6144^3/2^32) */
/* f3 = (1/54)(c4+[0,1]) - [0,0.75] */
/* claim: 2^19 f3 < x < 2^19(f3+1) */
/* where x = 9709(c4+2) */
/* proof: x - 2^19 f3 = 19418 - (1/27)c4 - (262144/27)[0,1] + 2^19[0,0.75] */
/* at least 19418 - 247914/27 - 262144/27 > 0 */
/* at most 19418 + 2^19 0.75 < 2^19 */
f3 = (9709*(c4+2)) >> 19;
c4 -= (f3 * 27) << 1;
c3 += c4 << 8;
/* f0 + f1*6144 + f2*6144^2 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 */
/* c3 <= 10329 = floor(4591*6144^2/2^24) */
/* f2 = (4/9)c3 + (1/576)c2 + (1/147456)c1 + (1/37748736)c0 - [0,0.75] */
/* claim: 2^19 f2 < x < 2^19(f2+1) */
/* where x = 233017 c3 + 910(c2+2) */
/* proof: x - 2^19 f2 = 1820 + (1/9)c3 - (2/9)c2 - (32/9)c1 - (1/72)c0 + 2^19[0,0.75] */
/* at least 1820 - (2/9)255 - (32/9)255 - (1/72)255 > 0 */
/* at most 1820 + (1/9)10329 + 2^19 0.75 < 2^19 */
f2 = (233017*c3 + 910*(c2+2)) >> 19;
c2 += c3 << 8;
c2 -= (f2 * 9) << 6;
c1 += c2 << 8;
/* f0 + f1*6144 */
/* = c0 + c1*256 */
/* c1 <= 110184 = floor(4591*6144/2^8) */
/* f1 = (1/24)c1 + (1/6144)c0 - (1/6144)f0 */
/* claim: 2^19 f1 < x < 2^19(f1+1) */
/* where x = 21845(c1+2) + 85 c0 */
/* proof: x - 2^19 f1 = 43690 - (1/3)c1 - (1/3)c0 + 2^19 [0,0.75] */
/* at least 43690 - (1/3)110184 - (1/3)255 > 0 */
/* at most 43690 + 2^19 0.75 < 2^19 */
f1 = (21845*(c1+2) + 85*c0) >> 19;
c1 -= (f1 * 3) << 3;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 + q - qshift);
*f++ = modq_freeze(f1 + q - qshift);
*f++ = modq_freeze(f2 + q - qshift);
*f++ = modq_freeze(f3 + q - qshift);
*f++ = modq_freeze(f4 + q - qshift);
}
c0 = *c++;
c1 = *c++;
c0 += c1 << 8;
*f++ = modq_freeze(c0 + q - qshift);
}
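/* Round-trip sketch (not part of the diff): for any f whose coefficients are
   already frozen to [-2295,2295], rq_decode inverts rq_encode. */
static void rq_encode_roundtrip_demo(const modq *f) /* f assumed frozen */
{
  unsigned char c[rq_encode_len];
  modq g[p];
  rq_encode(c,f);
  rq_decode(g,c);
  /* now g[i] == f[i] for 0 <= i < p */
}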

@ -0,0 +1,28 @@
#ifndef rq_h
#define rq_h
#include "modq.h"
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_ref_rq_encode
extern void rq_encode(unsigned char *,const modq *);
#define rq_decode crypto_kem_sntrup4591761_ref_rq_decode
extern void rq_decode(modq *,const unsigned char *);
#define rq_encoderounded crypto_kem_sntrup4591761_ref_rq_encoderounded
extern void rq_encoderounded(unsigned char *,const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_ref_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_ref_rq_round3
extern void rq_round3(modq *,const modq *);
#define rq_mult crypto_kem_sntrup4591761_ref_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
#define rq_recip3 crypto_kem_sntrup4591761_ref_rq_recip3
extern int rq_recip3(modq *,const small *);
#endif

@ -0,0 +1,30 @@
#include "params.h"
#include "rq.h"
void rq_mult(modq *h,const modq *f,const small *g)
{
modq fg[p + p - 1];
modq result;
int i, j;
for (i = 0;i < p;++i) {
result = 0;
for (j = 0;j <= i;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p;i < p + p - 1;++i) {
result = 0;
for (j = i - p + 1;j < p;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
fg[i] = result;
}
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = modq_sum(fg[i - p],fg[i]);
fg[i - p + 1] = modq_sum(fg[i - p + 1],fg[i]);
}
for (i = 0;i < p;++i)
h[i] = fg[i];
}

@ -0,0 +1,125 @@
#include "params.h"
#include "swap.h"
#include "rq.h"
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
{
return (x - y) >> 31;
}
static void vectormodq_product(modq *z,int len,const modq *x,const modq c)
{
int i;
for (i = 0;i < len;++i) z[i] = modq_product(x[i],c);
}
static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c)
{
int i;
for (i = 0;i < len;++i) z[i] = modq_minusproduct(x[i],y[i],c);
}
static void vectormodq_shift(modq *z,int len)
{
int i;
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
z[0] = 0;
}
/*
r = (3s)^(-1) mod m, returning 0, if s is invertible mod m
or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int rq_recip3(modq *r,const small *s)
{
const int loops = 2*p + 1;
int loop;
modq f[p + 1];
modq g[p + 1];
modq u[loops + 1];
modq v[loops + 1];
modq c;
int i;
int d = p;
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = 3 * s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
loop = 0;
for (;;) {
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
/* f[i]==0 for i < p-d */
/* g has degree <=p (so it fits in p+1 coefficients) */
/* g[i]==0 for i < p-e */
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
c = modq_quotient(g[p],f[p]);
vectormodq_minusproduct(g,p + 1,g,f,c);
vectormodq_shift(g,p + 1);
#ifdef SIMPLER
vectormodq_minusproduct(v,loops + 1,v,u,c);
vectormodq_shift(v,loops + 1);
#else
if (loop < p) {
vectormodq_minusproduct(v,loop + 1,v,u,c);
vectormodq_shift(v,loop + 2);
} else {
vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormodq_shift(v + loop - p,p + 2);
}
#endif
e -= 1;
++loop;
swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,(p + 1) * sizeof(modq),swapmask);
#ifdef SIMPLER
swap(u,v,(loops + 1) * sizeof(modq),swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(modq),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask);
}
#endif
}
c = modq_reciprocal(f[p]);
vectormodq_product(r,p,u + p,c);
return smaller_mask(0,d);
}
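/* Verification sketch (not part of the diff): when rq_recip3(r,s) returns 0,
   r is (3s)^(-1) in R/q, so 3*(r*s) is the constant polynomial 1. */
static void rq_recip3_check_demo(const modq *r,const small *s)
{
  modq t[p];
  rq_mult(t,r,s);
  /* expect modq_freeze(3*t[0]) == 1 and modq_freeze(3*t[i]) == 0 for i > 0 */
}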

@ -0,0 +1,10 @@
#include "params.h"
#include "rq.h"
void rq_round3(modq *h,const modq *f)
{
int i;
for (i = 0;i < p;++i)
h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295;
}

@ -0,0 +1,101 @@
#include "params.h"
#include "crypto_uint32.h"
#include "rq.h"
void rq_encoderounded(unsigned char *c,const modq *f)
{
crypto_int32 f0, f1, f2;
int i;
for (i = 0;i < p/3;++i) {
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f2 = *f++ + qshift;
f0 = (21846 * f0) >> 16;
f1 = (21846 * f1) >> 16;
f2 = (21846 * f2) >> 16;
/* now want f0 + f1*1536 + f2*1536^2 as a 32-bit integer */
f2 *= 3;
f1 += f2 << 9;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
/* XXX: using p mod 3 = 2 */
f0 = *f++ + qshift;
f1 = *f++ + qshift;
f0 = (21846 * f0) >> 16;
f1 = (21846 * f1) >> 16;
f1 *= 3;
f0 += f1 << 9;
*c++ = f0; f0 >>= 8;
*c++ = f0; f0 >>= 8;
*c++ = f0;
}
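/* Worked example (a sketch, not part of the diff): the extreme coefficients
   -2295, 0, 2295 map through (21846*(f+qshift)) >> 16 to f0,f1,f2 = 0, 765,
   1530 and pack into the 32-bit word 0 + 765*1536 + 1530*1536^2 = 3610897920;
   the decoder below recovers 3*fi - 2295 = -2295, 0, 2295. */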
void rq_decoderounded(modq *f,const unsigned char *c)
{
crypto_uint32 c0, c1, c2, c3;
crypto_uint32 f0, f1, f2;
int i;
for (i = 0;i < p/3;++i) {
c0 = *c++;
c1 = *c++;
c2 = *c++;
c3 = *c++;
/* f0 + f1*1536 + f2*1536^2 */
/* = c0 + c1*256 + c2*256^2 + c3*256^3 */
/* with each f between 0 and 1530 */
/* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */
/* claim: 2^21 f2 < x < 2^21(f2+1) */
/* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */
/* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */
/* at least 456 - (8/9)255 - (2/9)255 > 0 */
/* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */
f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21;
c2 += c3 << 8;
c2 -= (f2 * 9) << 2;
/* f0 + f1*1536 */
/* = c0 + c1*256 + c2*256^2 */
/* c2 <= 35 = floor((1530+1530*1536)/256^2) */
/* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */
/* claim: 2^21 f1 < x < 2^21(f1+1) */
/* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */
/* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */
/* at least 1365 - (1/3)35 - (1/3)255 - (1/3)255 > 0 */
/* at most 1365 + (4096/3)1530 < 2^21 */
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
*f++ = modq_freeze(f2 * 3 + q - qshift);
}
c0 = *c++;
c1 = *c++;
c2 = *c++;
f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21;
c1 += c2 << 8;
c1 -= (f1 * 3) << 1;
c0 += c1 << 8;
f0 = c0;
*f++ = modq_freeze(f0 * 3 + q - qshift);
*f++ = modq_freeze(f1 * 3 + q - qshift);
}

@ -0,0 +1,37 @@
#include "params.h"
#include "small.h"
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void small_encode(unsigned char *c,const small *f)
{
small c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *f++ + 1;
c0 += (*f++ + 1) << 2;
c0 += (*f++ + 1) << 4;
c0 += (*f++ + 1) << 6;
*c++ = c0;
}
c0 = *f++ + 1;
*c++ = c0;
}
void small_decode(small *f,const unsigned char *c)
{
unsigned char c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1;
}
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1;
}
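/* Worked example (a sketch, not part of the diff): each coefficient maps
   through f+1 to {0,1,2} and four pack into one byte in base 4, e.g.
   (-1,0,1,-1) -> (0,1,2,0) -> 0 + (1<<2) + (2<<4) + (0<<6) = 0x24. */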

@ -0,0 +1,24 @@
#ifndef small_h
#define small_h
#include "crypto_int8.h"
#include "crypto_int32.h"
typedef crypto_int8 small;
#define small_encode crypto_kem_sntrup4591761_ref_small_encode
extern void small_encode(unsigned char *,const small *);
#define small_decode crypto_kem_sntrup4591761_ref_small_decode
extern void small_decode(small *,const unsigned char *);
#define small_random32 crypto_kem_sntrup4591761_ref_small_random32
extern crypto_int32 small_random32(void);
#define small_random crypto_kem_sntrup4591761_ref_small_random
extern void small_random(small *);
#define small_random_weightw crypto_kem_sntrup4591761_ref_small_random_weightw
extern void small_random_weightw(small *);
#endif

@ -0,0 +1,19 @@
#include "swap.h"
void swap(void *x,void *y,int bytes,int mask)
{
int i;
char xi, yi, c, t;
c = mask;
for (i = 0;i < bytes;++i) {
xi = i[(char *) x];
yi = i[(char *) y];
t = c & (xi ^ yi);
xi ^= t;
yi ^= t;
i[(char *) x] = xi;
i[(char *) y] = yi;
}
}
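/* Usage sketch (not part of the diff): mask must be 0 (keep) or -1 (swap
   every byte), exactly the values smaller_mask and the nonzero masks yield. */
static void swap_demo(void)
{
  int a = 1, b = 2;
  swap(&a,&b,sizeof a,-1); /* now a == 2, b == 1 */
  swap(&a,&b,sizeof a,0);  /* unchanged */
}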
