From a60c4b0bef6dce19634172bfbfc45b3df53ca169 Mon Sep 17 00:00:00 2001 From: Jeff Becker Date: Mon, 13 Aug 2018 09:12:21 -0400 Subject: [PATCH] import ntru prime code --- .../crypto_kem/ntrulpr4591761/avx/api.h | 4 + .../crypto_kem/ntrulpr4591761/avx/dec.c | 57 ++ .../crypto_kem/ntrulpr4591761/avx/enc.c | 30 + .../crypto_kem/ntrulpr4591761/avx/hide.c | 40 + .../crypto_kem/ntrulpr4591761/avx/hide.h | 9 + .../ntrulpr4591761/avx/implementors | 5 + .../ntrulpr4591761/avx/int32_sort.c | 425 ++++++++++ .../ntrulpr4591761/avx/int32_sort.h | 9 + .../crypto_kem/ntrulpr4591761/avx/keypair.c | 37 + .../crypto_kem/ntrulpr4591761/avx/modq.h | 36 + .../crypto_kem/ntrulpr4591761/avx/mult.c | 738 +++++++++++++++++ .../crypto_kem/ntrulpr4591761/avx/params.h | 15 + .../ntrulpr4591761/avx/randomweightw.c | 29 + .../crypto_kem/ntrulpr4591761/avx/rq.h | 37 + .../ntrulpr4591761/avx/rq_fromseed.c | 21 + .../crypto_kem/ntrulpr4591761/avx/rq_right.c | 21 + .../ntrulpr4591761/avx/rq_rounded.c | 260 ++++++ .../crypto_kem/ntrulpr4591761/avx/rq_top.c | 17 + .../crypto_kem/ntrulpr4591761/avx/small.c | 44 + .../crypto_kem/ntrulpr4591761/avx/small.h | 27 + .../crypto_kem/ntrulpr4591761/checksumbig | 1 + .../crypto_kem/ntrulpr4591761/checksumsmall | 1 + .../crypto_kem/ntrulpr4591761/description | 1 + .../crypto_kem/ntrulpr4591761/designers | 5 + .../crypto_kem/ntrulpr4591761/ref/README | 30 + .../crypto_kem/ntrulpr4591761/ref/api.h | 4 + .../crypto_kem/ntrulpr4591761/ref/dec.c | 68 ++ .../crypto_kem/ntrulpr4591761/ref/enc.c | 30 + .../crypto_kem/ntrulpr4591761/ref/hide.c | 49 ++ .../crypto_kem/ntrulpr4591761/ref/hide.h | 9 + .../ntrulpr4591761/ref/implementors | 5 + .../ntrulpr4591761/ref/int32_sort.c | 35 + .../ntrulpr4591761/ref/int32_sort.h | 9 + .../crypto_kem/ntrulpr4591761/ref/keypair.c | 39 + .../crypto_kem/ntrulpr4591761/ref/modq.h | 44 + .../crypto_kem/ntrulpr4591761/ref/params.h | 15 + .../ntrulpr4591761/ref/randomweightw.c | 28 + .../crypto_kem/ntrulpr4591761/ref/rq.h | 31 + 
.../ntrulpr4591761/ref/rq_fromseed.c | 15 + .../crypto_kem/ntrulpr4591761/ref/rq_mult.c | 30 + .../crypto_kem/ntrulpr4591761/ref/rq_round3.c | 10 + .../ntrulpr4591761/ref/rq_rounded.c | 101 +++ .../crypto_kem/ntrulpr4591761/ref/small.c | 37 + .../crypto_kem/ntrulpr4591761/ref/small.h | 27 + .../crypto_kem/sntrup4591761/avx/api.h | 4 + .../crypto_kem/sntrup4591761/avx/dec.c | 67 ++ .../crypto_kem/sntrup4591761/avx/enc.c | 48 ++ .../crypto_kem/sntrup4591761/avx/implementors | 5 + .../crypto_kem/sntrup4591761/avx/int32_sort.c | 425 ++++++++++ .../crypto_kem/sntrup4591761/avx/int32_sort.h | 9 + .../crypto_kem/sntrup4591761/avx/keypair.c | 39 + .../crypto_kem/sntrup4591761/avx/mod3.h | 60 ++ .../crypto_kem/sntrup4591761/avx/modq.h | 91 +++ .../crypto_kem/sntrup4591761/avx/mult.c | 762 ++++++++++++++++++ .../crypto_kem/sntrup4591761/avx/params.h | 14 + .../crypto_kem/sntrup4591761/avx/r3.h | 15 + .../crypto_kem/sntrup4591761/avx/r3_recip.c | 192 +++++ .../sntrup4591761/avx/randomsmall.c | 17 + .../sntrup4591761/avx/randomweightw.c | 17 + .../crypto_kem/sntrup4591761/avx/rq.c | 158 ++++ .../crypto_kem/sntrup4591761/avx/rq.h | 31 + .../crypto_kem/sntrup4591761/avx/rq_mod3.c | 49 ++ .../crypto_kem/sntrup4591761/avx/rq_recip3.c | 215 +++++ .../crypto_kem/sntrup4591761/avx/rq_round3.c | 20 + .../crypto_kem/sntrup4591761/avx/rq_rounded.c | 260 ++++++ .../crypto_kem/sntrup4591761/avx/small.c | 45 ++ .../crypto_kem/sntrup4591761/avx/small.h | 20 + .../crypto_kem/sntrup4591761/avx/swap.c | 32 + .../crypto_kem/sntrup4591761/avx/swap.h | 7 + .../crypto_kem/sntrup4591761/avx/weight.c | 28 + .../crypto_kem/sntrup4591761/checksumbig | 1 + .../crypto_kem/sntrup4591761/checksumsmall | 1 + .../crypto_kem/sntrup4591761/description | 1 + .../crypto_kem/sntrup4591761/designers | 5 + .../crypto_kem/sntrup4591761/ref/README | 32 + .../crypto_kem/sntrup4591761/ref/api.h | 4 + .../crypto_kem/sntrup4591761/ref/dec.c | 71 ++ .../crypto_kem/sntrup4591761/ref/enc.c | 49 ++ 
.../crypto_kem/sntrup4591761/ref/implementors | 5 + .../crypto_kem/sntrup4591761/ref/int32_sort.c | 35 + .../crypto_kem/sntrup4591761/ref/int32_sort.h | 9 + .../crypto_kem/sntrup4591761/ref/keypair.c | 39 + .../crypto_kem/sntrup4591761/ref/mod3.h | 60 ++ .../crypto_kem/sntrup4591761/ref/modq.h | 92 +++ .../crypto_kem/sntrup4591761/ref/params.h | 14 + .../crypto_kem/sntrup4591761/ref/r3.h | 12 + .../crypto_kem/sntrup4591761/ref/r3_mult.c | 31 + .../crypto_kem/sntrup4591761/ref/r3_recip.c | 126 +++ .../crypto_kem/sntrup4591761/ref/random32.c | 24 + .../sntrup4591761/ref/randomsmall.c | 14 + .../sntrup4591761/ref/randomweightw.c | 16 + .../crypto_kem/sntrup4591761/ref/rq.c | 128 +++ .../crypto_kem/sntrup4591761/ref/rq.h | 28 + .../crypto_kem/sntrup4591761/ref/rq_mult.c | 30 + .../crypto_kem/sntrup4591761/ref/rq_recip3.c | 125 +++ .../crypto_kem/sntrup4591761/ref/rq_round3.c | 10 + .../crypto_kem/sntrup4591761/ref/rq_rounded.c | 101 +++ .../crypto_kem/sntrup4591761/ref/small.c | 37 + .../crypto_kem/sntrup4591761/ref/small.h | 24 + .../crypto_kem/sntrup4591761/ref/swap.c | 19 + .../crypto_kem/sntrup4591761/ref/swap.h | 7 + 101 files changed, 6265 insertions(+) create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/api.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/dec.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/enc.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/hide.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/hide.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/implementors create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/int32_sort.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/int32_sort.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/keypair.c create mode 100644 
crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/modq.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/mult.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/params.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/randomweightw.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_fromseed.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_right.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_rounded.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_top.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/small.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/small.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/checksumbig create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/checksumsmall create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/description create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/designers create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/README create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/api.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/dec.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/enc.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/hide.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/hide.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/implementors create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/int32_sort.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/int32_sort.h 
create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/keypair.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/modq.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/params.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/randomweightw.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_fromseed.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_mult.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_round3.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_rounded.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/small.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/small.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/api.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/dec.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/enc.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/implementors create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/int32_sort.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/int32_sort.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/keypair.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/mod3.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/modq.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/mult.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/params.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/r3.h create mode 100644 
crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/r3_recip.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/randomsmall.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/randomweightw.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_mod3.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_recip3.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_round3.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_rounded.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/small.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/small.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/swap.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/swap.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/weight.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/checksumbig create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/checksumsmall create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/description create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/designers create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/README create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/api.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/dec.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/enc.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/implementors create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/int32_sort.c create mode 100644 
crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/int32_sort.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/keypair.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/mod3.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/modq.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/params.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3_mult.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3_recip.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/random32.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/randomsmall.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/randomweightw.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_mult.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_recip3.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_round3.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_rounded.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/small.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/small.h create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/swap.c create mode 100644 crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/swap.h diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/api.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/api.h new file mode 100644 index 000000000..593d7eb48 --- /dev/null +++ 
b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/api.h @@ -0,0 +1,4 @@ +#define CRYPTO_SECRETKEYBYTES 1238 +#define CRYPTO_PUBLICKEYBYTES 1047 +#define CRYPTO_CIPHERTEXTBYTES 1175 +#define CRYPTO_BYTES 32 diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/dec.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/dec.c new file mode 100644 index 000000000..88ef7f206 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/dec.c @@ -0,0 +1,57 @@ +#ifdef KAT +#include +#endif + +#include "params.h" +#include "small.h" +#include "rq.h" +#include "hide.h" +#include "crypto_kem.h" + +static int verify(const unsigned char *x,const unsigned char *y) +{ + unsigned int differentbits = 0; + int i; + for (i = 0;i < crypto_kem_CIPHERTEXTBYTES;++i) + differentbits |= x[i] ^ y[i]; + return (1 & ((differentbits - 1) >> 8)) - 1; +} + +int crypto_kem_dec( + unsigned char *k, + const unsigned char *cstr, + const unsigned char *sk +) +{ + modq buf[768]; +#define B buf +#define aB buf + small a[768]; + unsigned char r[32]; + unsigned char checkcstr[crypto_kem_CIPHERTEXTBYTES]; + unsigned char maybek[32]; + int i; + int result; + + small_decode(a,sk); sk += small_encode_len; + rq_decoderounded(B,cstr + 32); + rq_mult(aB,B,a); + + rq_rightsubbit(r,cstr + 32 + rq_encoderounded_len,aB); + +#ifdef KAT + { + int j; + printf("decrypt r: "); + for (j = 0;j < 32;++j) + printf("%02x",255 & (int) r[j]); + printf("\n"); + } +#endif + + hide(checkcstr,maybek,sk,r); + result = verify(cstr,checkcstr); + + for (i = 0;i < 32;++i) k[i] = maybek[i] & ~result; + return result; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/enc.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/enc.c new file mode 100644 index 000000000..d9790fcb9 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/enc.c @@ -0,0 +1,30 @@ +#ifdef KAT +#include +#endif + +#include "hide.h" +#include "randombytes.h" +#include 
"crypto_kem.h" + +int crypto_kem_enc( + unsigned char *cstr, + unsigned char *k, + const unsigned char *pk +) +{ + unsigned char r[32]; + randombytes(r,32); + +#ifdef KAT + { + int i; + printf("encrypt r: "); + for (i = 0;i < 32;++i) + printf("%02x",255 & (int) r[i]); + printf("\n"); + } +#endif + + hide(cstr,k,pk,r); + return 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/hide.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/hide.c new file mode 100644 index 000000000..36b17c0dc --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/hide.c @@ -0,0 +1,40 @@ +#include +#include "crypto_hash_sha512.h" +#include "crypto_kem.h" +#include "params.h" +#include "rq.h" +#include "hide.h" + +#if crypto_kem_CIPHERTEXTBYTES != rq_encoderounded_len + 32 + 128 +#error "crypto_kem_CIPHERTEXTBYTES must match rq_encoderounded_len + 32 + 128" +#endif + +void hide(unsigned char *cstr,unsigned char *k,const unsigned char *pk,const unsigned char *r) +{ + modq buf[768]; +#define G buf +#define A buf +#define B buf +#define C buf + unsigned char k12[64]; + unsigned char k34[64]; + small b[768]; + + crypto_hash_sha512(k12,r,32); + small_seeded_weightw(b,k12); + + crypto_hash_sha512(k34,k12 + 32,32); + memcpy(cstr,k34,32); cstr += 32; + memcpy(k,k34 + 32,32); + + rq_fromseed(G,pk); + rq_mult(B,G,b); + /* XXX: cache transform of b for next mult */ + /* XXX: cache transform of G inside sk */ + /* XXX: cache transform of G when pk is otherwise reused */ + rq_roundencode(cstr,B); cstr += rq_encoderounded_len; + + rq_decoderounded(A,pk + 32); + rq_mult(C,A,b); + rq_top(cstr,C,r); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/hide.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/hide.h new file mode 100644 index 000000000..0b3ebcddf --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/hide.h @@ -0,0 +1,9 @@ +#ifndef hide_h +#define hide_h + +#include "crypto_int32.h" + 
+#define hide crypto_kem_ntrulpr4591761_avx_hide +extern void hide(unsigned char *,unsigned char *,const unsigned char *,const unsigned char *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/implementors b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/implementors new file mode 100644 index 000000000..51ac31ea2 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/implementors @@ -0,0 +1,5 @@ +Alphabetical order: +Daniel J. Bernstein +Chitchanok Chuengsatiansup +Tanja Lange +Christine van Vredendaal diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/int32_sort.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/int32_sort.c new file mode 100644 index 000000000..e950efe6a --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/int32_sort.c @@ -0,0 +1,425 @@ +#include "int32_sort.h" +#include + +typedef crypto_int32 int32; + +static inline void minmax(int32 *x,int32 *y) +{ + asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg %%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)" + : : "r"(x),"r"(y) : "%eax","%ebx","%edx"); +} + +/* sort x0,x2; sort x1,x3; ... 
sort x13, x15 */ +static inline void minmax02through1315(int32 *x) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_loadu_si256((__m256i *) (x + 8)); + __m256i c = _mm256_unpacklo_epi64(a,b); /* a01b01a45b45 */ + __m256i d = _mm256_unpackhi_epi64(a,b); /* a23b23a67b67 */ + __m256i g = _mm256_min_epi32(c,d); + __m256i h = _mm256_max_epi32(c,d); + a = _mm256_unpacklo_epi64(g,h); + b = _mm256_unpackhi_epi64(g,h); + _mm256_storeu_si256((__m256i *) x,a); + _mm256_storeu_si256((__m256i *) (x + 8),b); +} + +/* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */ +static inline void minmax02134657(int32 *x) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_shuffle_epi32(a,0x4e); + __m256i c = _mm256_cmpgt_epi32(a,b); + c = _mm256_shuffle_epi32(c,0x44); + __m256i abc = c & (a ^ b); + a ^= abc; + _mm256_storeu_si256((__m256i *) x,a); +} + +static void multiminmax2plus2( + int32 *x, + int n) +{ + while (n >= 16) { + minmax02through1315(x); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax02134657(x); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + 2); + minmax(x + 1,x + 3); + n -= 4; + x += 4; + } + if (n > 0) { + minmax(x,x + 2); + if (n > 1) minmax(x + 1,x + 3); + } +} + +static void multiminmax2plus6( + int32 *x, + int n) +{ + while (n >= 4) { + minmax(x,x + 6); + minmax(x + 1,x + 7); + n -= 4; + x += 4; + } + if (n > 0) { + minmax(x,x + 6); + if (n > 1) minmax(x + 1,x + 7); + } +} + +static void multiminmax2plus14( + int32 *x, + int n) +{ + while (n >= 8) { + minmax(x,x + 14); + minmax(x + 1,x + 15); + minmax(x + 4,x + 18); + minmax(x + 5,x + 19); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + 14); + minmax(x + 1,x + 15); + n -= 4; + x += 4; + } + if (n > 0) { + minmax(x,x + 14); + if (n > 1) minmax(x + 1,x + 15); + } +} + +/* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */ +/* all of x0...x15 and y0...y15 must exist; no aliasing */ +static inline void minmax0145891213(int32 *x,int32 *y) +{ + __m256i a01234567 = 
_mm256_loadu_si256((__m256i *) x); + __m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8)); + __m256i b01234567 = _mm256_loadu_si256((__m256i *) y); + __m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8)); + + __m256i a0189451213 = _mm256_unpacklo_epi64(a01234567,a89101112131415); + __m256i b0189451213 = _mm256_unpacklo_epi64(b01234567,b89101112131415); + __m256i c0189451213 = _mm256_min_epi32(a0189451213,b0189451213); + __m256i d0189451213 = _mm256_max_epi32(a0189451213,b0189451213); + + __m256i c01234567 = _mm256_blend_epi32(a01234567,c0189451213,0x33); + __m256i d01234567 = _mm256_blend_epi32(b01234567,d0189451213,0x33); + __m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213,a89101112131415); + __m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213,b89101112131415); + + _mm256_storeu_si256((__m256i *) x,c01234567); + _mm256_storeu_si256((__m256i *) (x + 8),c89101112131415); + _mm256_storeu_si256((__m256i *) y,d01234567); + _mm256_storeu_si256((__m256i *) (y + 8),d89101112131415); +} + +/* offset >= 30 */ +static void multiminmax2plusmore( + int32 *x, + int n, + int offset) +{ + while (n >= 16) { + minmax0145891213(x,x + offset); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax(x,x + offset); + minmax(x + 1,x + 1 + offset); + minmax(x + 4,x + 4 + offset); + minmax(x + 5,x + 5 + offset); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + offset); + minmax(x + 1,x + 1 + offset); + n -= 4; + x += 4; + } + if (n > 0) { + minmax(x,x + offset); + if (n > 1) minmax(x + 1,x + 1 + offset); + } +} + +/* sort x0,x1; ... 
sort x14, x15 */ +static inline void minmax01through1415(int32 *x) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_loadu_si256((__m256i *) (x + 8)); + __m256i c = _mm256_unpacklo_epi32(a,b); /* ab0ab1ab4ab5 */ + __m256i d = _mm256_unpackhi_epi32(a,b); /* ab2ab3ab6ab7 */ + __m256i e = _mm256_unpacklo_epi32(c,d); /* a02b02a46b46 */ + __m256i f = _mm256_unpackhi_epi32(c,d); /* a13b13a57b57 */ + __m256i g = _mm256_min_epi32(e,f); /* a02b02a46b46 */ + __m256i h = _mm256_max_epi32(e,f); /* a13b13a57b57 */ + a = _mm256_unpacklo_epi32(g,h); + b = _mm256_unpackhi_epi32(g,h); + _mm256_storeu_si256((__m256i *) x,a); + _mm256_storeu_si256((__m256i *) (x + 8),b); +} + +/* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */ +static inline void minmax01234567(int32 *x) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_shuffle_epi32(a,0xb1); + __m256i c = _mm256_cmpgt_epi32(a,b); + c = _mm256_shuffle_epi32(c,0xa0); + __m256i abc = c & (a ^ b); + a ^= abc; + _mm256_storeu_si256((__m256i *) x,a); +} + +static void multiminmax1plus1( + int32 *x, + int n) +{ + while (n >= 16) { + minmax01through1415(x); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax01234567(x); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + 1); + minmax(x + 2,x + 3); + n -= 4; + x += 4; + } + if (n >= 2) { + minmax(x,x + 1); + n -= 2; + x += 2; + } + if (n > 0) + minmax(x,x + 1); +} + +static void multiminmax1( + int32 *x, + int n, + int offset) +{ + while (n >= 16) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + minmax(x + 4,x + 4 + offset); + minmax(x + 6,x + 6 + offset); + minmax(x + 8,x + 8 + offset); + minmax(x + 10,x + 10 + offset); + minmax(x + 12,x + 12 + offset); + minmax(x + 14,x + 14 + offset); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + minmax(x + 4,x + 4 + offset); + minmax(x + 6,x + 6 + offset); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + 
offset); + n -= 4; + x += 4; + } + if (n >= 2) { + minmax(x,x + offset); + n -= 2; + x += 2; + } + if (n > 0) + minmax(x,x + offset); +} + +/* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */ +/* all of x0...x15 and y0...y15 must exist; no aliasing */ +static inline void minmax02468101214(int32 *x,int32 *y) +{ + __m256i a01234567 = _mm256_loadu_si256((__m256i *) x); + __m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8)); + __m256i b01234567 = _mm256_loadu_si256((__m256i *) y); + __m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8)); + + __m256i a0819412513 = _mm256_unpacklo_epi32(a01234567,a89101112131415); + __m256i a210311614715 = _mm256_unpackhi_epi32(a01234567,a89101112131415); + __m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513,a210311614715); + __m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513,a210311614715); + + __m256i b0819412513 = _mm256_unpacklo_epi32(b01234567,b89101112131415); + __m256i b210311614715 = _mm256_unpackhi_epi32(b01234567,b89101112131415); + __m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513,b210311614715); + __m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513,b210311614715); + + __m256i c02810461214 = _mm256_min_epi32(a02810461214,b02810461214); + __m256i d02810461214 = _mm256_max_epi32(a02810461214,b02810461214); + + __m256i c01234567 = _mm256_unpacklo_epi32(c02810461214,a13911571315); + __m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214,a13911571315); + __m256i d01234567 = _mm256_unpacklo_epi32(d02810461214,b13911571315); + __m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214,b13911571315); + + _mm256_storeu_si256((__m256i *) x,c01234567); + _mm256_storeu_si256((__m256i *) (x + 8),c89101112131415); + _mm256_storeu_si256((__m256i *) y,d01234567); + _mm256_storeu_si256((__m256i *) (y + 8),d89101112131415); +} + +/* assumes offset >= 31 */ +static void multiminmax1plusmore( + int32 *x, + int n, + int offset) +{ + while (n >= 16) { + minmax02468101214(x,x + offset); + n -= 16; 
+ x += 16; + } + if (n >= 8) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + minmax(x + 4,x + 4 + offset); + minmax(x + 6,x + 6 + offset); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + n -= 4; + x += 4; + } + if (n >= 2) { + minmax(x,x + offset); + n -= 2; + x += 2; + } + if (n > 0) + minmax(x,x + offset); +} + +/* sort x0,y0; sort x1,y1; ...; sort x7,y7 */ +static inline void minmax8(int32 *x,int32 *y) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_loadu_si256((__m256i *) y); + _mm256_storeu_si256((__m256i *) x,_mm256_min_epi32(a,b)); + _mm256_storeu_si256((__m256i *) y,_mm256_max_epi32(a,b)); +} + +/* assumes p >= 8; implies offset >= 8 */ +static void multiminmax_atleast8(int p, + int32 *x, + int n, + int offset) +{ + int i; + while (n >= 2 * p) { + for (i = 0;i < p;i += 8) + minmax8(x + i,x + i + offset); + n -= 2 * p; + x += 2 * p; + } + for (i = 0;i + 8 <= n;i += 8) { + if (i & p) return; + minmax8(x + i,x + i + offset); + } + for (;i < n;++i) { + if (i & p) return; + minmax(x + i,x + i + offset); + } +} + +/* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */ +static inline void minmax4(int32 *x,int32 *y) +{ + __m128i a = _mm_loadu_si128((__m128i *) x); + __m128i b = _mm_loadu_si128((__m128i *) y); + _mm_storeu_si128((__m128i *) x,_mm_min_epi32(a,b)); + _mm_storeu_si128((__m128i *) y,_mm_max_epi32(a,b)); +} + +static void multiminmax4( + int32 *x, + int n, + int offset) +{ + int i; + while (n >= 8) { + minmax4(x,x + offset); + n -= 8; + x += 8; + } + if (n >= 4) + minmax4(x,x + offset); + else + for (i = 0;i < n;++i) + minmax(x + i,x + i + offset); +} + +void int32_sort(int32 *x,int n) +{ + int top,p,q; + + if (n < 2) return; + top = 1; + while (top < n - top) top += top; + + for (p = top;p >= 8;p >>= 1) { + multiminmax_atleast8(p,x,n - p,p); + for (q = top;q > p;q >>= 1) + multiminmax_atleast8(p,x + p,n - q,q - p); + } + if (p >= 4) { + multiminmax4(x,n - 4,4); + for (q 
= top;q > 4;q >>= 1) + multiminmax4(x + 4,n - q,q - 4); + } + if (p >= 2) { + multiminmax2plus2(x,n - 2); + for (q = top;q >= 32;q >>= 1) + multiminmax2plusmore(x + 2,n - q,q - 2); + if (q >= 16) + multiminmax2plus14(x + 2,n - 16); + if (q >= 8) + multiminmax2plus6(x + 2,n - 8); + if (q >= 4) + multiminmax2plus2(x + 2,n - 4); + } + multiminmax1plus1(x,n - 1); + for (q = top;q >= 32;q >>= 1) + multiminmax1plusmore(x + 1,n - q,q - 1); + if (q >= 16) + multiminmax1(x + 1,n - 16,15); + if (q >= 8) + multiminmax1(x + 1,n - 8,7); + if (q >= 4) + multiminmax1(x + 1,n - 4,3); + if (q >= 2) + multiminmax1plus1(x + 1,n - 2); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/int32_sort.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/int32_sort.h new file mode 100644 index 000000000..2508b1eff --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef int32_sort_h +#define int32_sort_h + +#include "crypto_int32.h" + +#define int32_sort crypto_kem_ntrulpr4591761_avx_int32_sort +extern void int32_sort(crypto_int32 *,int); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/keypair.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/keypair.c new file mode 100644 index 000000000..e73c44cc7 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/keypair.c @@ -0,0 +1,37 @@ +#include +#include "modq.h" +#include "params.h" +#include "small.h" +#include "rq.h" +#include "crypto_kem.h" +#include "randombytes.h" +#include "crypto_stream_aes256ctr.h" + +#if crypto_kem_PUBLICKEYBYTES != rq_encoderounded_len + 32 +#error "crypto_kem_PUBLICKEYBYTES must match rq_encoderounded_len + 32" +#endif +#if crypto_kem_SECRETKEYBYTES != small_encode_len + crypto_kem_PUBLICKEYBYTES +#error "crypto_kem_SECRETKEYBYTES must match small_encode_len + crypto_kem_PUBLICKEYBYTES" +#endif + +int crypto_kem_keypair(unsigned char *pk,unsigned char *sk) +{ + modq 
buf[768]; +#define G buf +#define A buf + small a[768]; + + randombytes(pk,32); + rq_fromseed(G,pk); + + small_random_weightw(a); + + rq_mult(A,G,a); + + rq_roundencode(pk + 32,A); + + small_encode(sk,a); + memcpy(sk + small_encode_len,pk,crypto_kem_PUBLICKEYBYTES); + + return 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/modq.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/modq.h new file mode 100644 index 000000000..d87b89bbc --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/modq.h @@ -0,0 +1,36 @@ +#ifndef modq_h +#define modq_h + +#include "crypto_int16.h" +#include "crypto_int32.h" +#include "crypto_uint16.h" +#include "crypto_uint32.h" + +typedef crypto_int16 modq; + +/* input between -9000000 and 9000000 */ +/* output between -2295 and 2295 */ +static inline modq modq_freeze(crypto_int32 a) +{ + a -= 4591 * ((228 * a) >> 20); + a -= 4591 * ((58470 * a + 134217728) >> 28); + return a; +} + +/* input between 0 and 4294967295 */ +/* output = (input % 4591) - 2295 */ +static inline modq modq_fromuint32(crypto_uint32 a) +{ + crypto_int32 r; + r = (a & 524287) + (a >> 19) * 914; /* <= 8010861 */ + return modq_freeze(r - 2295); +} + +static inline modq modq_sum(modq a,modq b) +{ + crypto_int32 A = a; + crypto_int32 B = b; + return modq_freeze(A + B); +} + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/mult.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/mult.c new file mode 100644 index 000000000..f88eb88e3 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/mult.c @@ -0,0 +1,738 @@ +#include +#include +#include "rq.h" + +#define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + h0 += f0 * gj; \ + _mm256_storeu_ps(&h[i + j],h0); \ + h1 += f1 * gj; \ + h2 += f2 * gj; \ + h3 += f3 * gj; \ + h4 += f4 * gj; \ + h0 = _mm256_loadu_ps(&h[i + j + 5]); \ + h0 += f5 * gj; + +#define MULSTEP_asm(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + __asm__( \ + 
"vfmadd231ps %5,%6,%0 \n\t" \ + "vmovups %0,%12 \n\t" \ + "vmovups %13,%0 \n\t" \ + "vfmadd231ps %5,%7,%1 \n\t" \ + "vfmadd231ps %5,%8,%2 \n\t" \ + "vfmadd231ps %5,%9,%3 \n\t" \ + "vfmadd231ps %5,%10,%4 \n\t" \ + "vfmadd231ps %5,%11,%0 \n\t" \ + : "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \ + : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5])); + +#define MULSTEP MULSTEP_asm + +#define MULSTEP_noload(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + __asm__( \ + "vfmadd231ps %5,%6,%0 \n\t" \ + "vmovups %0,%12 \n\t" \ + "vfmadd231ps %5,%7,%1 \n\t" \ + "vfmadd231ps %5,%8,%2 \n\t" \ + "vfmadd231ps %5,%9,%3 \n\t" \ + "vfmadd231ps %5,%10,%4 \n\t" \ + "vmulps %5,%11,%0 \n\t" \ + : "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \ + : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j])); + +#define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + __asm__( \ + "vmulps %5,%6,%0 \n\t" \ + "vmovups %0,%12 \n\t" \ + "vmulps %5,%7,%1 \n\t" \ + "vmulps %5,%8,%2 \n\t" \ + "vmulps %5,%9,%3 \n\t" \ + "vmulps %5,%10,%4 \n\t" \ + "vmulps %5,%11,%0 \n\t" \ + : "=&x"(h0),"=&x"(h1),"=&x"(h2),"=&x"(h3),"=&x"(h4) \ + : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j])); + +static inline __m128i _mm_load_cvtepi8_epi16(const long long *x) +{ + __m128i result; + __asm__("vpmovsxbw %1, %0" : "=x"(result) : "m"(*x)); + return result; +} + +#define v0 _mm256_set1_epi32(0) +#define v0_128 _mm_set1_epi32(0) +#define v7 _mm256_set1_epi16(7) +#define v4591_16 _mm256_set1_epi16(4591) +#define v2296_16 _mm256_set1_epi16(2296) + +#define alpha_32 _mm256_set1_epi32(0x4b400000) +#define alpha_32_128 _mm_set1_epi32(0x4b400000) +#define alpha_float _mm256_set1_ps(12582912.0) + +#define v0_float _mm256_set1_ps(0) +#define v1_float _mm256_set1_ps(1) +#define vm1_float _mm256_set1_ps(-1) +#define vm4591_float _mm256_set1_ps(-4591) +#define recip4591_float _mm256_set1_ps(0.00021781746896101067305597908952297974298) + +static inline __m256 add(__m256 
x,__m256 y) +{ + return x + y; +} + +static inline __m256 fastadd(__m256 x,__m256 y) +{ + return _mm256_fmadd_ps(y,v1_float,x); +} + +static inline __m256 fastsub(__m256 x,__m256 y) +{ + return _mm256_fmadd_ps(y,vm1_float,x); +} + +static inline __m256 reduce(__m256 x) +{ + __m256 q = x * recip4591_float; + q = _mm256_round_ps(q,8); + return _mm256_fmadd_ps(q,vm4591_float,x); +} + +static inline __m256i squeeze(__m256i x) +{ + __m256i q = _mm256_mulhrs_epi16(x,v7); + q = _mm256_mullo_epi16(q,v4591_16); + return _mm256_sub_epi16(x,q); +} + +static inline __m256i squeezeadd16(__m256i x,__m256i y) +{ + __m256i q; + x = _mm256_add_epi16(x,y); + q = _mm256_mulhrs_epi16(x,v7); + q = _mm256_mullo_epi16(q,v4591_16); + return _mm256_sub_epi16(x,q); +} + +static inline __m256i freeze(__m256i x) +{ + __m256i mask, x2296, x4591; + x4591 = _mm256_add_epi16(x,v4591_16); + mask = _mm256_srai_epi16(x,15); + x = _mm256_blendv_epi8(x,x4591,mask); + x2296 = _mm256_sub_epi16(x,v2296_16); + mask = _mm256_srai_epi16(x2296,15); + x4591 = _mm256_sub_epi16(x,v4591_16); + x = _mm256_blendv_epi8(x4591,x,mask); + return x; +} + +/* 24*8*float32 f inputs between -10000 and 10000 */ +/* 24*8*float32 g inputs between -32 and 32 */ +/* 48*8*float32 h outputs between -7680000 and 7680000 */ +static void mult24x8_float(__m256 h[48],const __m256 f[24],const __m256 g[24]) +{ + int i, j; + __m256 f0, f1, f2, f3, f4, f5, gj, h0, h1, h2, h3, h4; + + i = 0; + f0 = f[i]; + f1 = f[i + 1]; + f2 = f[i + 2]; + f3 = f[i + 3]; + f4 = f[i + 4]; + f5 = f[i + 5]; + MULSTEP_fromzero(0,h0,h1,h2,h3,h4) + for (j = 0;j < 20;j += 5) { + MULSTEP_noload(j + 1,h1,h2,h3,h4,h0) + MULSTEP_noload(j + 2,h2,h3,h4,h0,h1) + MULSTEP_noload(j + 3,h3,h4,h0,h1,h2) + MULSTEP_noload(j + 4,h4,h0,h1,h2,h3) + MULSTEP_noload(j + 5,h0,h1,h2,h3,h4) + } + MULSTEP_noload(j + 1,h1,h2,h3,h4,h0) + MULSTEP_noload(j + 2,h2,h3,h4,h0,h1) + MULSTEP_noload(j + 3,h3,h4,h0,h1,h2) + h[i + j + 4] = h4; + h[i + j + 5] = h0; + h[i + j + 6] = h1; + h[i + j + 
7] = h2; + h[i + j + 8] = h3; + + for (i = 6;i < 24;i += 6) { + f0 = f[i]; + f1 = f[i + 1]; + f2 = f[i + 2]; + f3 = f[i + 3]; + f4 = f[i + 4]; + f5 = f[i + 5]; + h0 = h[i]; + h1 = h[i + 1]; + h2 = h[i + 2]; + h3 = h[i + 3]; + h4 = h[i + 4]; + for (j = 0;j < 15;j += 5) { + MULSTEP(j + 0,h0,h1,h2,h3,h4) + MULSTEP(j + 1,h1,h2,h3,h4,h0) + MULSTEP(j + 2,h2,h3,h4,h0,h1) + MULSTEP(j + 3,h3,h4,h0,h1,h2) + MULSTEP(j + 4,h4,h0,h1,h2,h3) + } + MULSTEP(j + 0,h0,h1,h2,h3,h4) + MULSTEP(j + 1,h1,h2,h3,h4,h0) + MULSTEP(j + 2,h2,h3,h4,h0,h1) + MULSTEP_noload(j + 3,h3,h4,h0,h1,h2) + MULSTEP_noload(j + 4,h4,h0,h1,h2,h3) + MULSTEP_noload(j + 5,h0,h1,h2,h3,h4) + MULSTEP_noload(j + 6,h1,h2,h3,h4,h0) + MULSTEP_noload(j + 7,h2,h3,h4,h0,h1) + MULSTEP_noload(j + 8,h3,h4,h0,h1,h2) + h[i + j + 9] = h4; + h[i + j + 10] = h0; + h[i + j + 11] = h1; + h[i + j + 12] = h2; + h[i + j + 13] = h3; + } + + h[47] = v0_float; +} + +/* 48*8*float32 f inputs between -5000 and 5000 */ +/* 48*8*float32 g inputs between -16 and 16 */ +/* 96*8*float32 h outputs between -3840000 and 3840000 */ +static void mult48x8_float(__m256 h[96],const __m256 f[48],const __m256 g[48]) +{ + __m256 h01[48]; + __m256 g01[24]; + __m256 *f01 = h01 + 24; + int i; + + for (i = 24;i > 0;) { + i -= 2; + f01[i] = f[i] + f[i + 24]; + g01[i] = g[i] + g[i + 24]; + f01[i + 1] = f[i + 1] + f[i + 1 + 24]; + g01[i + 1] = g[i + 1] + g[i + 1 + 24]; + } + + mult24x8_float(h,f,g); + mult24x8_float(h + 48,f + 24,g + 24); + mult24x8_float(h01,f01,g01); + + for (i = 0;i < 24;++i) { + __m256 h0i = h[i]; + __m256 h0itop = h[i + 24]; + __m256 h1i = h[i + 48]; + __m256 h1itop = h[i + 72]; + __m256 h01i = h01[i]; + __m256 h01itop = h01[i + 24]; + __m256 c = fastsub(h0itop,h1i); + h[i + 24] = c + fastsub(h01i,h0i); + h[i + 48] = fastsub(h01itop,h1itop) - c; + } +} + +/* 96*8*float32 f inputs between -2500 and 2500 */ +/* 96*8*float32 g inputs between -8 and 8 */ +/* 192*8*float32 h outputs between -1920000 and 1920000 */ +static void 
mult96x8_float(__m256 h[192],const __m256 f[96],const __m256 g[96]) +{ + __m256 h01[96]; + __m256 g01[48]; + __m256 *f01 = h01 + 48; + int i; + + for (i = 48;i > 0;) { + i -= 4; + f01[i] = f[i] + f[i + 48]; + g01[i] = g[i] + g[i + 48]; + f01[i + 1] = f[i + 1] + f[i + 1 + 48]; + g01[i + 1] = g[i + 1] + g[i + 1 + 48]; + f01[i + 2] = f[i + 2] + f[i + 2 + 48]; + g01[i + 2] = g[i + 2] + g[i + 2 + 48]; + f01[i + 3] = f[i + 3] + f[i + 3 + 48]; + g01[i + 3] = g[i + 3] + g[i + 3 + 48]; + } + + mult48x8_float(h,f,g); + mult48x8_float(h + 96,f + 48,g + 48); + mult48x8_float(h01,f01,g01); + + for (i = 0;i < 48;++i) { + __m256 h0i = h[i]; + __m256 h0itop = h[i + 48]; + __m256 h1i = h[i + 96]; + __m256 h1itop = h[i + 144]; + __m256 h01i = h01[i]; + __m256 h01itop = h01[i + 48]; + __m256 c = fastsub(h0itop,h1i); + h[i + 48] = c + fastsub(h01i,h0i); + h[i + 96] = fastsub(h01itop,h1itop) - c; + } +} + +/* 96*16*int16 f inputs between -2500 and 2500 */ +/* 96*(16*int8 stored in 32*int8) g inputs between -8 and 8 */ +/* 192*16*int16 h outputs between -2400 and 2400 */ +static void mult96x16(__m256i h[192],const __m256i f[96],const __m256i g[96]) +{ + __m256 hfloat[192]; + __m256 gfloat[96]; + __m256 *ffloat = hfloat + 96; + int i, p; + + for (p = 0;p < 2;++p) { + for (i = 96;i > 0;) { + i -= 2; + __m256i fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i])); + __m256i gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i])); + __m256 storage; + *(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32); + ffloat[i] = storage - alpha_float; + *(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32); + gfloat[i] = storage - alpha_float; + fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i + 1])); + gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i + 1])); + *(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32); + ffloat[i + 1] = storage - alpha_float; + *(__m256i *) &storage = 
_mm256_add_epi32(gi,alpha_32); + gfloat[i + 1] = storage - alpha_float; + } + mult96x8_float(hfloat,ffloat,gfloat); + for (i = 192;i > 0;) { + __m128i h0, h1; + i -= 4; + hfloat[i] = add(alpha_float,reduce(hfloat[i])); + hfloat[i + 1] = fastadd(alpha_float,reduce(hfloat[i + 1])); + hfloat[i + 2] = add(alpha_float,reduce(hfloat[i + 2])); + hfloat[i + 3] = fastadd(alpha_float,reduce(hfloat[i + 3])); + h0 = 0[(__m128i *) &hfloat[i]]; h0 = _mm_sub_epi32(h0,alpha_32_128); + h1 = 1[(__m128i *) &hfloat[i]]; h1 = _mm_sub_epi32(h1,alpha_32_128); + _mm_storeu_si128(p + (__m128i *) &h[i],_mm_packs_epi32(h0,h1)); + h0 = 0[(__m128i *) &hfloat[i + 1]]; h0 = _mm_sub_epi32(h0,alpha_32_128); + h1 = 1[(__m128i *) &hfloat[i + 1]]; h1 = _mm_sub_epi32(h1,alpha_32_128); + _mm_storeu_si128(p + (__m128i *) &h[i + 1],_mm_packs_epi32(h0,h1)); + h0 = 0[(__m128i *) &hfloat[i + 2]]; h0 = _mm_sub_epi32(h0,alpha_32_128); + h1 = 1[(__m128i *) &hfloat[i + 2]]; h1 = _mm_sub_epi32(h1,alpha_32_128); + _mm_storeu_si128(p + (__m128i *) &h[i + 2],_mm_packs_epi32(h0,h1)); + h0 = 0[(__m128i *) &hfloat[i + 3]]; h0 = _mm_sub_epi32(h0,alpha_32_128); + h1 = 1[(__m128i *) &hfloat[i + 3]]; h1 = _mm_sub_epi32(h1,alpha_32_128); + _mm_storeu_si128(p + (__m128i *) &h[i + 3],_mm_packs_epi32(h0,h1)); + } + } +} + +/* int16 i of output x[j] is int16 j of input x[i] */ +static void transpose16(__m256i x[16]) +{ + const static int rev[4] = {0,4,2,6}; + int i; + __m256i y[16]; + + for (i = 0;i < 16;i += 4) { + __m256i a0 = x[i]; + __m256i a1 = x[i + 1]; + __m256i a2 = x[i + 2]; + __m256i a3 = x[i + 3]; + __m256i b0 = _mm256_unpacklo_epi16(a0,a1); + __m256i b1 = _mm256_unpackhi_epi16(a0,a1); + __m256i b2 = _mm256_unpacklo_epi16(a2,a3); + __m256i b3 = _mm256_unpackhi_epi16(a2,a3); + __m256i c0 = _mm256_unpacklo_epi32(b0,b2); + __m256i c2 = _mm256_unpackhi_epi32(b0,b2); + __m256i c1 = _mm256_unpacklo_epi32(b1,b3); + __m256i c3 = _mm256_unpackhi_epi32(b1,b3); + y[i] = c0; + y[i + 2] = c2; + y[i + 1] = c1; + y[i + 3] = c3; + 
} + for (i = 0;i < 4;++i) { + int r = rev[i]; + __m256i c0 = y[i]; + __m256i c4 = y[i + 4]; + __m256i c8 = y[i + 8]; + __m256i c12 = y[i + 12]; + __m256i d0 = _mm256_unpacklo_epi64(c0,c4); + __m256i d4 = _mm256_unpackhi_epi64(c0,c4); + __m256i d8 = _mm256_unpacklo_epi64(c8,c12); + __m256i d12 = _mm256_unpackhi_epi64(c8,c12); + __m256i e0 = _mm256_permute2x128_si256(d0,d8,0x20); + __m256i e8 = _mm256_permute2x128_si256(d0,d8,0x31); + __m256i e4 = _mm256_permute2x128_si256(d4,d12,0x20); + __m256i e12 = _mm256_permute2x128_si256(d4,d12,0x31); + x[r] = e0; + x[r + 8] = e8; + x[r + 1] = e4; + x[r + 9] = e12; + } +} + +/* byte i of output x[j] is byte j of input x[i] */ +static void transpose32(__m256i x[32]) +{ + const static int rev[4] = {0,8,4,12}; + int i; + __m256i y[32]; + + for (i = 0;i < 32;i += 4) { + __m256i a0 = x[i]; + __m256i a1 = x[i + 1]; + __m256i a2 = x[i + 2]; + __m256i a3 = x[i + 3]; + __m256i b0 = _mm256_unpacklo_epi8(a0,a1); + __m256i b1 = _mm256_unpackhi_epi8(a0,a1); + __m256i b2 = _mm256_unpacklo_epi8(a2,a3); + __m256i b3 = _mm256_unpackhi_epi8(a2,a3); + __m256i c0 = _mm256_unpacklo_epi16(b0,b2); + __m256i c2 = _mm256_unpackhi_epi16(b0,b2); + __m256i c1 = _mm256_unpacklo_epi16(b1,b3); + __m256i c3 = _mm256_unpackhi_epi16(b1,b3); + y[i] = c0; + y[i + 2] = c2; + y[i + 1] = c1; + y[i + 3] = c3; + } + for (i = 0;i < 4;++i) { + int r = rev[i]; + __m256i c0 = y[i]; + __m256i c8 = y[i + 8]; + __m256i c16 = y[i + 16]; + __m256i c24 = y[i + 24]; + __m256i c4 = y[i + 4]; + __m256i c12 = y[i + 12]; + __m256i c20 = y[i + 20]; + __m256i c28 = y[i + 28]; + __m256i d0 = _mm256_unpacklo_epi32(c0,c4); + __m256i d4 = _mm256_unpackhi_epi32(c0,c4); + __m256i d8 = _mm256_unpacklo_epi32(c8,c12); + __m256i d12 = _mm256_unpackhi_epi32(c8,c12); + __m256i d16 = _mm256_unpacklo_epi32(c16,c20); + __m256i d20 = _mm256_unpackhi_epi32(c16,c20); + __m256i d24 = _mm256_unpacklo_epi32(c24,c28); + __m256i d28 = _mm256_unpackhi_epi32(c24,c28); + __m256i e0 = 
_mm256_unpacklo_epi64(d0,d8); + __m256i e8 = _mm256_unpackhi_epi64(d0,d8); + __m256i e16 = _mm256_unpacklo_epi64(d16,d24); + __m256i e24 = _mm256_unpackhi_epi64(d16,d24); + __m256i e4 = _mm256_unpacklo_epi64(d4,d12); + __m256i e12 = _mm256_unpackhi_epi64(d4,d12); + __m256i e20 = _mm256_unpacklo_epi64(d20,d28); + __m256i e28 = _mm256_unpackhi_epi64(d20,d28); + __m256i f0 = _mm256_permute2x128_si256(e0,e16,0x20); + __m256i f16 = _mm256_permute2x128_si256(e0,e16,0x31); + __m256i f8 = _mm256_permute2x128_si256(e8,e24,0x20); + __m256i f24 = _mm256_permute2x128_si256(e8,e24,0x31); + __m256i f4 = _mm256_permute2x128_si256(e4,e20,0x20); + __m256i f20 = _mm256_permute2x128_si256(e4,e20,0x31); + __m256i f12 = _mm256_permute2x128_si256(e12,e28,0x20); + __m256i f28 = _mm256_permute2x128_si256(e12,e28,0x31); + x[r] = f0; + x[r + 16] = f16; + x[r + 1] = f8; + x[r + 17] = f24; + x[r + 2] = f4; + x[r + 18] = f20; + x[r + 3] = f12; + x[r + 19] = f28; + } +} + +/* 48*16*int16 f inputs between -2295 and 2295 */ +/* 24*32*int8 g inputs between -1 and 1 */ +/* 96*16*int16 h outputs between -2295 and 2295 */ +static void mult768_mix2_m256i(__m256i h[96],const __m256i f[48],const __m256i g[24]) +{ + __m256i hkara[24][16]; + __m256i gkara[3][32]; +#define fkara hkara + int i; + + for (i = 6;i-- > 0;) { + __m256i f0, f1, f2, f3, f4, f5, f6, f7; + __m256i f01, f23, f45, f67; + __m256i f02, f46, f04, f26, f0426; + __m256i f13, f57, f15, f37, f1537; + __m256i f0213, f4657, f04261537, f0415, f2637; + + f0 = _mm256_loadu_si256(&f[i + 0]); + f1 = _mm256_loadu_si256(&f[i + 6]); + f2 = _mm256_loadu_si256(&f[i + 12]); + f3 = _mm256_loadu_si256(&f[i + 18]); + f4 = _mm256_loadu_si256(&f[i + 24]); + f5 = _mm256_loadu_si256(&f[i + 30]); + f6 = _mm256_loadu_si256(&f[i + 36]); + f7 = _mm256_loadu_si256(&f[i + 42]); + f01 = squeezeadd16(f0,f1); fkara[i][8] = f01; + f23 = squeezeadd16(f2,f3); fkara[i][9] = f23; + f45 = squeezeadd16(f4,f5); fkara[i][10] = f45; + f67 = squeezeadd16(f6,f7); fkara[i][11] = 
f67; + + fkara[i][0] = f0; + fkara[i][2] = f2; + fkara[i][4] = f4; + fkara[i][6] = f6; + + f02 = squeezeadd16(f0,f2); fkara[i + 6][0] = f02; + f04 = squeezeadd16(f0,f4); fkara[i + 6][6] = f04; + f46 = squeezeadd16(f4,f6); fkara[i + 6][3] = f46; + f26 = squeezeadd16(f2,f6); fkara[i + 6][8] = f26; + + fkara[i][1] = f1; + fkara[i][3] = f3; + fkara[i][5] = f5; + fkara[i][7] = f7; + + f13 = squeezeadd16(f1,f3); fkara[i + 6][1] = f13; + f15 = squeezeadd16(f1,f5); fkara[i + 6][7] = f15; + f57 = squeezeadd16(f5,f7); fkara[i + 6][4] = f57; + f37 = squeezeadd16(f3,f7); fkara[i + 6][9] = f37; + + f0426 = squeezeadd16(f04,f26); fkara[i + 6][12] = f0426; + f1537 = squeezeadd16(f15,f37); fkara[i + 6][13] = f1537; + f0213 = squeezeadd16(f02,f13); fkara[i + 6][2] = f0213; + f4657 = squeezeadd16(f46,f57); fkara[i + 6][5] = f4657; + f0415 = squeezeadd16(f04,f15); fkara[i + 6][10] = f0415; + f2637 = squeezeadd16(f26,f37); fkara[i + 6][11] = f2637; + f04261537 = squeezeadd16(f0426,f1537); fkara[i + 6][14] = f04261537; + + fkara[i][12] = v0; + fkara[i][13] = v0; + fkara[i][14] = v0; + fkara[i][15] = v0; + fkara[i + 6][15] = v0; + } + + for (i = 3;i-- > 0;) { + __m256i g0, g1, g2, g3, g4, g5, g6, g7; + __m256i g01, g23, g45, g67; + __m256i g02, g46, g04, g26, g0426; + __m256i g13, g57, g15, g37, g1537; + __m256i g0213, g4657, g04261537, g0415, g2637; + + g0 = _mm256_loadu_si256(&g[i + 0]); + g1 = _mm256_loadu_si256(&g[i + 3]); + g2 = _mm256_loadu_si256(&g[i + 6]); + g3 = _mm256_loadu_si256(&g[i + 9]); + g4 = _mm256_loadu_si256(&g[i + 12]); + g5 = _mm256_loadu_si256(&g[i + 15]); + g6 = _mm256_loadu_si256(&g[i + 18]); + g7 = _mm256_loadu_si256(&g[i + 21]); + g01 = _mm256_add_epi8(g0,g1); gkara[i][8] = g01; + g23 = _mm256_add_epi8(g2,g3); gkara[i][9] = g23; + g45 = _mm256_add_epi8(g4,g5); gkara[i][10] = g45; + g67 = _mm256_add_epi8(g6,g7); gkara[i][11] = g67; + + gkara[i][0] = g0; + gkara[i][2] = g2; + gkara[i][4] = g4; + gkara[i][6] = g6; + + g02 = _mm256_add_epi8(g0,g2); gkara[i][16] = 
g02; + g04 = _mm256_add_epi8(g0,g4); gkara[i][22] = g04; + g46 = _mm256_add_epi8(g4,g6); gkara[i][19] = g46; + g26 = _mm256_add_epi8(g2,g6); gkara[i][24] = g26; + + gkara[i][1] = g1; + gkara[i][3] = g3; + gkara[i][5] = g5; + gkara[i][7] = g7; + + g13 = _mm256_add_epi8(g1,g3); gkara[i][17] = g13; + g15 = _mm256_add_epi8(g1,g5); gkara[i][23] = g15; + g57 = _mm256_add_epi8(g5,g7); gkara[i][20] = g57; + g37 = _mm256_add_epi8(g3,g7); gkara[i][25] = g37; + + g0426 = _mm256_add_epi8(g04,g26); gkara[i][28] = g0426; + g1537 = _mm256_add_epi8(g15,g37); gkara[i][29] = g1537; + g0213 = _mm256_add_epi8(g02,g13); gkara[i][18] = g0213; + g4657 = _mm256_add_epi8(g46,g57); gkara[i][21] = g4657; + g0415 = _mm256_add_epi8(g04,g15); gkara[i][26] = g0415; + g2637 = _mm256_add_epi8(g26,g37); gkara[i][27] = g2637; + g04261537 = _mm256_add_epi8(g0426,g1537); gkara[i][30] = g04261537; + + gkara[i][12] = v0; + gkara[i][13] = v0; + gkara[i][14] = v0; + gkara[i][15] = v0; + gkara[i][31] = v0; + } + + for (i = 12;i-- > 0;) + transpose16(fkara[i]); + for (i = 3;i-- > 0;) + transpose32(gkara[i]); + + mult96x16(hkara[12],fkara[6],(__m256i *) (1 + (__m128i *) gkara)); + mult96x16(hkara[0],fkara[0],gkara[0]); + + for (i = 24;i-- > 0;) + transpose16(hkara[i]); + + for (i = 6;i-- > 0;) { + __m256i h0,h1,h2,h3,h4,h5,h6,h7,h8,h9; + __m256i h10,h11,h12,h13,h14,h15,h16,h17,h18,h19; + __m256i h20,h21,h22,h23; + __m256i h32,h33,h34,h35,h36,h37,h38,h39; + __m256i h40,h41,h42,h43,h44,h45,h46,h47,h48,h49; + __m256i h50,h51,h52,h53,h54,h55,h56,h57,h58,h59; + __m256i h60,h61; + __m256i c; + +#define COMBINE(h0,h1,h2,h3,x0,x1) \ + c = _mm256_sub_epi16(h1,h2); \ + h1 = _mm256_sub_epi16(_mm256_add_epi16(c,x0),h0); \ + h2 = _mm256_sub_epi16(x1,_mm256_add_epi16(c,h3)); \ + h1 = squeeze(h1); \ + h2 = squeeze(h2); + + h56 = hkara[i + 12][12]; + h57 = hkara[i + 18][12]; + h58 = hkara[i + 12][13]; + h59 = hkara[i + 18][13]; + h60 = hkara[i + 12][14]; + h61 = hkara[i + 18][14]; + COMBINE(h56,h57,h58,h59,h60,h61) + + h44 
= hkara[i + 12][6]; + h45 = hkara[i + 18][6]; + h46 = hkara[i + 12][7]; + h47 = hkara[i + 18][7]; + h52 = hkara[i + 12][10]; + h53 = hkara[i + 18][10]; + COMBINE(h44,h45,h46,h47,h52,h53) + + h48 = hkara[i + 12][8]; + h49 = hkara[i + 18][8]; + h50 = hkara[i + 12][9]; + h51 = hkara[i + 18][9]; + h54 = hkara[i + 12][11]; + h55 = hkara[i + 18][11]; + COMBINE(h48,h49,h50,h51,h54,h55) + COMBINE(h44,h46,h48,h50,h56,h58) + COMBINE(h45,h47,h49,h51,h57,h59) + + h0 = hkara[i][0]; + h1 = hkara[i + 6][0]; + h2 = hkara[i][1]; + h3 = hkara[i + 6][1]; + h16 = hkara[i][8]; + h17 = hkara[i + 6][8]; + COMBINE(h0,h1,h2,h3,h16,h17) + + h4 = hkara[i][2]; + h5 = hkara[i + 6][2]; + h6 = hkara[i][3]; + h7 = hkara[i + 6][3]; + h18 = hkara[i][9]; + h19 = hkara[i + 6][9]; + COMBINE(h4,h5,h6,h7,h18,h19) + + h32 = hkara[i + 12][0]; + h33 = hkara[i + 18][0]; + h34 = hkara[i + 12][1]; + h35 = hkara[i + 18][1]; + h36 = hkara[i + 12][2]; + h37 = hkara[i + 18][2]; + COMBINE(h32,h33,h34,h35,h36,h37) + COMBINE(h1,h3,h5,h7,h33,h35) + COMBINE(h0,h2,h4,h6,h32,h34) + + h8 = hkara[i][4]; + h9 = hkara[i + 6][4]; + h10 = hkara[i][5]; + h11 = hkara[i + 6][5]; + h20 = hkara[i][10]; + h21 = hkara[i + 6][10]; + COMBINE(h8,h9,h10,h11,h20,h21) + + h12 = hkara[i][6]; + h13 = hkara[i + 6][6]; + h14 = hkara[i][7]; + h15 = hkara[i + 6][7]; + h22 = hkara[i][11]; + h23 = hkara[i + 6][11]; + COMBINE(h12,h13,h14,h15,h22,h23) + + h38 = hkara[i + 12][3]; + h39 = hkara[i + 18][3]; + h40 = hkara[i + 12][4]; + h41 = hkara[i + 18][4]; + h42 = hkara[i + 12][5]; + h43 = hkara[i + 18][5]; + COMBINE(h38,h39,h40,h41,h42,h43) + COMBINE(h8,h10,h12,h14,h38,h40) + COMBINE(h9,h11,h13,h15,h39,h41) + + COMBINE(h0,h4,h8,h12,h44,h48) + h0 = freeze(h0); + h4 = freeze(h4); + h8 = freeze(h8); + h12 = freeze(h12); + _mm256_storeu_si256(&h[i + 0],h0); + _mm256_storeu_si256(&h[i + 24],h4); + _mm256_storeu_si256(&h[i + 48],h8); + _mm256_storeu_si256(&h[i + 72],h12); + + COMBINE(h1,h5,h9,h13,h45,h49) + h1 = freeze(h1); + h5 = freeze(h5); + h9 = 
freeze(h9); + h13 = freeze(h13); + _mm256_storeu_si256(&h[i + 6],h1); + _mm256_storeu_si256(&h[i + 30],h5); + _mm256_storeu_si256(&h[i + 54],h9); + _mm256_storeu_si256(&h[i + 78],h13); + + COMBINE(h2,h6,h10,h14,h46,h50) + h2 = freeze(h2); + h6 = freeze(h6); + h10 = freeze(h10); + h14 = freeze(h14); + _mm256_storeu_si256(&h[i + 12],h2); + _mm256_storeu_si256(&h[i + 36],h6); + _mm256_storeu_si256(&h[i + 60],h10); + _mm256_storeu_si256(&h[i + 84],h14); + + COMBINE(h3,h7,h11,h15,h47,h51) + h3 = freeze(h3); + h7 = freeze(h7); + h11 = freeze(h11); + h15 = freeze(h15); + _mm256_storeu_si256(&h[i + 18],h3); + _mm256_storeu_si256(&h[i + 42],h7); + _mm256_storeu_si256(&h[i + 66],h11); + _mm256_storeu_si256(&h[i + 90],h15); + } +} + +#define p 761 + +/* 761 f inputs between -2295 and 2295 */ +/* 761 g inputs between -1 and 1 */ +/* 761 h outputs between -2295 and 2295 */ +void rq_mult(modq *h,const modq *f,const small *g) +{ + __m256i fgvec[96]; + modq *fg; + int i; + + mult768_mix2_m256i(fgvec,(__m256i *) f,(__m256i *) g); + fg = (modq *) fgvec; + + h[0] = modq_freeze(fg[0] + fg[p]); + for (i = 1;i < 9;++i) + h[i] = modq_freeze(fg[i] + fg[i + p - 1] + fg[i + p]); + for (i = 9;i < 761;i += 16) { + __m256i fgi = _mm256_loadu_si256((__m256i *) &fg[i]); + __m256i fgip = _mm256_loadu_si256((__m256i *) &fg[i + p]); + __m256i fgip1 = _mm256_loadu_si256((__m256i *) &fg[i + p - 1]); + __m256i x = _mm256_add_epi16(fgi,_mm256_add_epi16(fgip,fgip1)); + x = freeze(squeeze(x)); + _mm256_storeu_si256((__m256i *) &h[i],x); + } + for (i = 761;i < 768;++i) + h[i] = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/params.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/params.h new file mode 100644 index 000000000..8c0d74f8b --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/params.h @@ -0,0 +1,15 @@ +#ifndef params_h +#define params_h + +#define q 4591 +/* XXX: also built into modq in various ways */ + +#define qshift 2295 +#define 
p 761 +#define w 250 + +#define rq_encode_len 1218 +#define rq_encoderounded_len 1015 +#define small_encode_len 191 + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/randomweightw.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/randomweightw.c new file mode 100644 index 000000000..cf389f7ce --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/randomweightw.c @@ -0,0 +1,29 @@ +#include "params.h" +#include "randombytes.h" +#include "int32_sort.h" +#include "small.h" +#include "crypto_stream_aes256ctr.h" + +static const unsigned char n[16] = {0}; + +void small_seeded_weightw(small *f,const unsigned char *k) +{ + crypto_int32 r[768]; + int i; + + crypto_stream_aes256ctr((unsigned char *) r,sizeof r,n,k); + for (i = 0;i < p;++i) r[i] ^= 0x80000000; + + for (i = 0;i < w;++i) r[i] &= -2; + for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1; + int32_sort(r,p); + for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1; + for (i = p;i < 768;++i) f[i] = 0; +} + +void small_random_weightw(small *f) +{ + unsigned char k[32]; + randombytes(k,32); + small_seeded_weightw(f,k); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq.h new file mode 100644 index 000000000..be9679c33 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq.h @@ -0,0 +1,37 @@ +#ifndef rq_h +#define rq_h + +#include "modq.h" +#include "small.h" + +#define rq_encode crypto_kem_ntrulpr4591761_avx_rq_encode +extern void rq_encode(unsigned char *,const modq *); + +#define rq_decode crypto_kem_ntrulpr4591761_avx_rq_decode +extern void rq_decode(modq *,const unsigned char *); + +#define rq_roundencode crypto_kem_ntrulpr4591761_avx_rq_roundencode +extern void rq_roundencode(unsigned char *,const modq *); + +#define rq_decoderounded crypto_kem_ntrulpr4591761_avx_rq_decoderounded +extern void rq_decoderounded(modq *,const unsigned char *); + +#define 
rq_round3 crypto_kem_ntrulpr4591761_avx_rq_round +extern void rq_round3(modq *,const modq *); + +#define rq_mult crypto_kem_ntrulpr4591761_avx_rq_mult +extern void rq_mult(modq *,const modq *,const small *); + +#define rq_recip3 crypto_kem_ntrulpr4591761_avx_rq_recip3 +int rq_recip3(modq *,const small *); + +#define rq_fromseed crypto_kem_ntrulpr4591761_avx_rq_fromseed +extern void rq_fromseed(modq *,const unsigned char *); + +#define rq_top crypto_kem_ntrulpr4591761_avx_rq_top +extern void rq_top(unsigned char *,const modq *,const unsigned char *); + +#define rq_rightsubbit crypto_kem_ntrulpr4591761_avx_rq_rightsubbit +extern void rq_rightsubbit(unsigned char *,const unsigned char *,const modq *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_fromseed.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_fromseed.c new file mode 100644 index 000000000..24e180421 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_fromseed.c @@ -0,0 +1,21 @@ +#include "crypto_stream_aes256ctr.h" +#include "rq.h" +#include "params.h" + +static const unsigned char n[16] = {0}; + +void rq_fromseed(modq *h,const unsigned char *K) +{ + crypto_uint32 buf[768]; + int i; + + crypto_stream_aes256ctr((unsigned char *) buf,sizeof buf,n,K); + /* will use 761*4 bytes */ + /* convenient for aes to generate multiples of 16 bytes */ + /* and multiples of more for some implementations */ + + for (i = 0;i < p;++i) + h[i] = modq_fromuint32(buf[i]); + for (i = p;i < 768;++i) + h[i] = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_right.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_right.c new file mode 100644 index 000000000..bec9a210f --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_right.c @@ -0,0 +1,21 @@ +#include "rq.h" +#include "params.h" + +void rq_rightsubbit(unsigned char *r,const unsigned char *c,const modq *ab) +{ + modq t[256]; + int i; + + 
for (i = 0;i < 128;++i) { + crypto_uint32 x = c[i]; + t[2*i] = (x & 15) * 287 - 2007; + t[2*i+1] = (x >> 4) * 287 - 2007; + } + + for (i = 0;i < 256;++i) + t[i] = -(modq_freeze(t[i] - ab[i] + 4*w+1) >> 14); + + for (i = 0;i < 32;++i) r[i] = 0; + for (i = 0;i < 256;++i) + r[i / 8] |= (t[i] << (i & 7)); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_rounded.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_rounded.c new file mode 100644 index 000000000..05b674635 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_rounded.c @@ -0,0 +1,260 @@ +#include +#include "params.h" +#include "crypto_uint32.h" +#include "rq.h" + +#define alpha_top _mm256_set1_epi32(0x43380000) +#define alpha _mm256_set1_pd(6755399441055744.0) +#define v10923_16 _mm256_set1_epi16(10923) +#define floor(x) _mm256_floor_pd(x) + +void rq_roundencode(unsigned char *c,const modq *f) +{ + int i; + __m256i h[50]; + + for (i = 0;i < 208;i += 16) { + __m256i a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2; + __m256i e0, e1, f0, f1, g0, g1; + a0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[0])); + a1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[8])); + a2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[16])); + a0 = _mm256_inserti128_si256(a0,_mm_loadu_si128((__m128i *) &f[24]),1); + a1 = _mm256_inserti128_si256(a1,_mm_loadu_si128((__m128i *) &f[32]),1); + a2 = _mm256_inserti128_si256(a2,_mm_loadu_si128((__m128i *) &f[40]),1); + f += 48; + + a0 = _mm256_mulhrs_epi16(a0,v10923_16); + a1 = _mm256_mulhrs_epi16(a1,v10923_16); + a2 = _mm256_mulhrs_epi16(a2,v10923_16); + + /* a0: a0 a1 a2 b0 b1 b2 c0 c1 and similar second half */ + /* a1: c2 d0 d1 d2 e0 e1 e2 f0 */ + /* a2: f1 f2 g0 g1 g2 h0 h1 h2 */ + + b1 = _mm256_blend_epi16(a2,a0,0xf0); + b1 = _mm256_shuffle_epi32(b1,0x4e); + b0 = _mm256_blend_epi16(a0,a1,0xf0); + b2 = _mm256_blend_epi16(a1,a2,0xf0); + /* XXX: use shufps instead? 
*/ + + /* b0: a0 a1 a2 b0 e0 e1 e2 f0 */ + /* b1: b1 b2 c0 c1 f1 f2 g0 g1 */ + /* b2: c2 d0 d1 d2 g2 h0 h1 h2 */ + + c1 = _mm256_blend_epi16(b2,b0,0xcc); + c1 = _mm256_shuffle_epi32(c1,0xb1); + c0 = _mm256_blend_epi16(b0,b1,0xcc); + c2 = _mm256_blend_epi16(b1,b2,0xcc); + + /* c0: a0 a1 c0 c1 e0 e1 g0 g1 */ + /* c1: a2 b0 c2 d0 e2 f0 g2 h0 */ + /* c2: b1 b2 d1 d2 f1 f2 h1 h2 */ + + d1 = _mm256_blend_epi16(c2,c0,0xaa); + d1 = _mm256_shufflelo_epi16(d1,0xb1); + d1 = _mm256_shufflehi_epi16(d1,0xb1); + d0 = _mm256_blend_epi16(c0,c1,0xaa); + d2 = _mm256_blend_epi16(c1,c2,0xaa); + + /* d0: a0 b0 c0 d0 e0 f0 g0 h0 */ + /* d1: a1 b1 c1 d1 e1 f1 g1 h1 */ + /* d2: a2 b2 c2 d2 e2 f2 g2 h2 */ + + d0 = _mm256_add_epi16(d0,_mm256_set1_epi16(765)); + d1 = _mm256_add_epi16(d1,_mm256_set1_epi16(765)); + d2 = _mm256_add_epi16(d2,_mm256_set1_epi16(765)); + /* want bytes of d0 + 1536*d1 + 1536*1536*d2 */ + + e0 = d0 & _mm256_set1_epi16(0xff); + d0 = _mm256_srli_epi16(d0,8); + /* want e0, d0 + 6*d1 + 6*1536*d2 */ + + d1 = _mm256_mullo_epi16(d1,_mm256_set1_epi16(6)); + d0 = _mm256_add_epi16(d0,d1); + /* want e0, d0 + 6*1536*d2 */ + + e1 = _mm256_slli_epi16(d0,8); + e0 = _mm256_add_epi16(e0,e1); + d0 = _mm256_srli_epi16(d0,8); + /* want e0, d0 + 36*d2 */ + + d2 = _mm256_mullo_epi16(d2,_mm256_set1_epi16(36)); + e1 = _mm256_add_epi16(d0,d2); + /* want e0, e1 */ + + /* e0: out0 out1 out4 out5 out8 out9 ... */ + /* e1: out2 out3 out6 out7 out10 out11 ... 
*/ + + f0 = _mm256_unpacklo_epi16(e0,e1); + f1 = _mm256_unpackhi_epi16(e0,e1); + + g0 = _mm256_permute2x128_si256(f0,f1,0x20); + g1 = _mm256_permute2x128_si256(f0,f1,0x31); + + _mm256_storeu_si256((__m256i *) c,g0); + _mm256_storeu_si256((__m256i *) (c + 32),g1); + c += 64; + } + + for (i = 0;i < 9;++i) { + __m256i x = _mm256_loadu_si256((__m256i *) &f[16 * i]); + _mm256_storeu_si256(&h[i],_mm256_mulhrs_epi16(x,v10923_16)); + } + f = (const modq *) h; + + for (i = 208;i < 253;++i) { + crypto_int32 f0, f1, f2; + f0 = *f++; + f1 = *f++; + f2 = *f++; + f0 += 1806037245; + f1 *= 3; + f2 *= 9; + f0 += f1 << 9; + f0 += f2 << 18; + *(crypto_int32 *) c = f0; + c += 4; + } + { + crypto_int32 f0, f1; + f0 = *f++; + f1 = *f++; + f0 += 1175805; + f1 *= 3; + f0 += f1 << 9; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; + } +} + +void rq_decoderounded(modq *f,const unsigned char *c) +{ + crypto_uint32 c0, c1, c2, c3; + crypto_uint32 f0, f1, f2; + int i; + + for (i = 0;i < 248;i += 8) { + __m256i abcdefgh, todo[2]; + __m256d x, f2, f1, f0; + __m128i if2, if1, if0; + int j; + + abcdefgh = _mm256_loadu_si256((__m256i *) c); + c += 32; + + todo[0] = _mm256_unpacklo_epi32(abcdefgh,alpha_top); + todo[1] = _mm256_unpackhi_epi32(abcdefgh,alpha_top); + + for (j = 0;j < 2;++j) { + x = *(__m256d *) &todo[j]; + x -= alpha; + + /* x is f0 + f1*1536 + f2*1536^2 */ + /* with each f between 0 and 1530 */ + + f2 = x * _mm256_set1_pd(0.00000042385525173611114052197733521876177320564238470979034900665283203125); + f2 = floor(f2); + x -= f2 * _mm256_set1_pd(2359296.0); + + f1 = x * _mm256_set1_pd(0.00065104166666666673894681149903362893383018672466278076171875); + f1 = floor(f1); + x -= f1 * _mm256_set1_pd(1536.0); + + f0 = x; + + f2 -= _mm256_set1_pd(1531.0) * floor(f2 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875)); + f1 -= _mm256_set1_pd(1531.0) * floor(f1 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875)); + f0 -= 
_mm256_set1_pd(1531.0) * floor(f0 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875)); + + f2 *= _mm256_set1_pd(3.0); f2 -= _mm256_set1_pd(2295.0); + f1 *= _mm256_set1_pd(3.0); f1 -= _mm256_set1_pd(2295.0); + f0 *= _mm256_set1_pd(3.0); f0 -= _mm256_set1_pd(2295.0); + + if2 = _mm256_cvtpd_epi32(f2); /* a2 b2 e2 f2 */ + if1 = _mm256_cvtpd_epi32(f1); /* a1 b1 e1 f1 */ + if0 = _mm256_cvtpd_epi32(f0); /* a0 b0 e0 f0 */ + + f[6*j + 0] = _mm_extract_epi32(if0,0); + f[6*j + 1] = _mm_extract_epi32(if1,0); + f[6*j + 2] = _mm_extract_epi32(if2,0); + f[6*j + 3] = _mm_extract_epi32(if0,1); + f[6*j + 4] = _mm_extract_epi32(if1,1); + f[6*j + 5] = _mm_extract_epi32(if2,1); + + f[6*j + 12] = _mm_extract_epi32(if0,2); + f[6*j + 13] = _mm_extract_epi32(if1,2); + f[6*j + 14] = _mm_extract_epi32(if2,2); + f[6*j + 15] = _mm_extract_epi32(if0,3); + f[6*j + 16] = _mm_extract_epi32(if1,3); + f[6*j + 17] = _mm_extract_epi32(if2,3); + } + + f += 24; + } + + for (i = 248;i < 253;++i) { + c0 = *c++; + c1 = *c++; + c2 = *c++; + c3 = *c++; + + /* f0 + f1*1536 + f2*1536^2 */ + /* = c0 + c1*256 + c2*256^2 + c3*256^3 */ + /* with each f between 0 and 1530 */ + + /* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */ + /* claim: 2^21 f2 < x < 2^21(f2+1) */ + /* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */ + /* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */ + /* at least 456 - (8/9)255 - (2/9)255 > 0 */ + /* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */ + f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21; + + c2 += c3 << 8; + c2 -= (f2 * 9) << 2; + /* f0 + f1*1536 */ + /* = c0 + c1*256 + c2*256^2 */ + /* c2 <= 35 = floor((1530+1530*1536)/256^2) */ + /* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */ + /* claim: 2^21 f1 < x < 2^21(f1+1) */ + /* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */ + /* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */ + /* at least 1365 - 
(1/3)35 - (1/3)255 - (1/3)255 > 0 */ + /* at most 1365 + (4096/3)1530 < 2^21 */ + f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21; + + c1 += c2 << 8; + c1 -= (f1 * 3) << 1; + + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 * 3 + q - qshift); + *f++ = modq_freeze(f1 * 3 + q - qshift); + *f++ = modq_freeze(f2 * 3 + q - qshift); + } + + c0 = *c++; + c1 = *c++; + c2 = *c++; + + f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21; + + c1 += c2 << 8; + c1 -= (f1 * 3) << 1; + + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 * 3 + q - qshift); + *f++ = modq_freeze(f1 * 3 + q - qshift); + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_top.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_top.c new file mode 100644 index 000000000..69f0bac51 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/rq_top.c @@ -0,0 +1,17 @@ +#include "rq.h" + +void rq_top(unsigned char *c,const modq *f,const unsigned char *r) +{ + modq T[256]; + int i; + + for (i = 0;i < 256;++i) { + modq x = f[i]; + x = modq_sum(x,2295 * (1 & (r[i / 8] >> (i & 7)))); + x = ((x + 2156) * 114 + 16384) >> 15; + T[i] = x; /* between 0 and 15 */ + } + + for (i = 0;i < 128;++i) + *c++ = T[2*i] + (T[2*i + 1] << 4); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/small.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/small.c new file mode 100644 index 000000000..04142baec --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/small.c @@ -0,0 +1,44 @@ +#include "params.h" +#include "small.h" + +/* XXX: these functions rely on p mod 4 = 1 */ + +/* all coefficients in -1, 0, 1 */ +void small_encode(unsigned char *c,const small *f) +{ + small c0; + int i; + + for (i = 0;i < p/4;++i) { + c0 = *f++ + 1; + c0 += (*f++ + 1) << 2; + c0 += (*f++ + 1) << 4; + c0 += (*f++ + 1) << 6; + *c++ = c0; + } + c0 = *f++ + 1; + 
*c++ = c0; +} + +void small_decode(small *f,const unsigned char *c) +{ + unsigned char c0; + int i; + + for (i = 0;i < p/4;++i) { + c0 = *c++; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; + } + c0 = *c++; + *f++ = ((small) (c0 & 3)) - 1; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/small.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/small.h new file mode 100644 index 000000000..0ad884fb5 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/avx/small.h @@ -0,0 +1,27 @@ +#ifndef small_h +#define small_h + +#include "crypto_int8.h" +#include "crypto_int32.h" + +typedef crypto_int8 small; + +#define small_encode crypto_kem_ntrulpr4591761_avx_small_encode +extern void small_encode(unsigned char *,const small *); + +#define small_decode crypto_kem_ntrulpr4591761_avx_small_decode +extern void small_decode(small *,const unsigned char *); + +#define small_random32 crypto_kem_ntrulpr4591761_avx_small_random32 +extern crypto_int32 small_random32(void); + +#define small_random crypto_kem_ntrulpr4591761_avx_small_random +extern void small_random(small *); + +#define small_seeded_weightw crypto_kem_ntrulpr4591761_avx_small_seeded_weightw +extern void small_seeded_weightw(small *,const unsigned char *); + +#define small_random_weightw crypto_kem_ntrulpr4591761_avx_small_random_weightw +extern void small_random_weightw(small *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/checksumbig b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/checksumbig new file mode 100644 index 000000000..083f09ae0 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/checksumbig @@ -0,0 +1 @@ +bcc60c85ac6ca2dbbe244878ba9b62019560516e8377aecd890c737bf5dcb05f diff --git 
a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/checksumsmall b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/checksumsmall new file mode 100644 index 000000000..fdb8b27bb --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/checksumsmall @@ -0,0 +1 @@ +a13b63e4929ab2ab97f7889f071245113ddd919bdaf1c883e12cd80fdf4f9e3e diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/description b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/description new file mode 100644 index 000000000..7827a166d --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/description @@ -0,0 +1 @@ +NTRU LPRime 4591^761 diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/designers b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/designers new file mode 100644 index 000000000..51ac31ea2 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/designers @@ -0,0 +1,5 @@ +Alphabetical order: +Daniel J. Bernstein +Chitchanok Chuengsatiansup +Tanja Lange +Christine van Vredendaal diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/README b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/README new file mode 100644 index 000000000..a6fba2106 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/README @@ -0,0 +1,30 @@ +This is a reference implementation of NTRU LPRime 4591^761. This +implementation is designed primarily for clarity, subject to the +following constraints: + + * The implementation is written in C. We have a separate Sage + implementation that is considerably more concise. + + * The implementation avoids data-dependent branches and array + indices. For example, conditional swaps are computed by arithmetic + rather than by branches. + + * The implementation avoids other C operations that often take + variable time. For example, divisions by 3 are computed via + multiplications and shifts. 
+ +This implementation does _not_ sacrifice clarity for speed. + +This implementation has not yet been reviewed for correctness or for +constant-time behavior. It does pass various tests and has no known +bugs, but there are at least some platforms where multiplications take +variable time, and fixing this requires platform-specific effort; see +https://www.bearssl.org/ctmul.html and http://repository.tue.nl/800603. + +This implementation allows "benign malleability" of ciphertexts, as +defined in http://www.shoup.net/papers/iso-1_1.pdf. A similar comment +applies to public keys. + +There is a separate "avx" implementation where similar comments apply, +except that "avx" _does_ sacrifice clarity for speed on CPUs with AVX2 +instructions. diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/api.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/api.h new file mode 100644 index 000000000..593d7eb48 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/api.h @@ -0,0 +1,4 @@ +#define CRYPTO_SECRETKEYBYTES 1238 +#define CRYPTO_PUBLICKEYBYTES 1047 +#define CRYPTO_CIPHERTEXTBYTES 1175 +#define CRYPTO_BYTES 32 diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/dec.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/dec.c new file mode 100644 index 000000000..72be80350 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/dec.c @@ -0,0 +1,68 @@ +#ifdef KAT +#include <stdio.h> +#endif + +#include "params.h" +#include "small.h" +#include "rq.h" +#include "hide.h" +#include "crypto_kem.h" + +static int verify(const unsigned char *x,const unsigned char *y) +{ + unsigned int differentbits = 0; + int i; + for (i = 0;i < crypto_kem_CIPHERTEXTBYTES;++i) + differentbits |= x[i] ^ y[i]; + return (1 & ((differentbits - 1) >> 8)) - 1; +} + +int crypto_kem_dec( + unsigned char *k, + const unsigned char *cstr, + const unsigned char *sk +) +{ + small a[p]; + modq B[p]; + modq aB[p]; + modq C[256]; + 
unsigned char r[32]; + unsigned char checkcstr[crypto_kem_CIPHERTEXTBYTES]; + unsigned char maybek[32]; + int i; + int result; + + small_decode(a,sk); sk += small_encode_len; + rq_decoderounded(B,cstr + 32); + rq_mult(aB,B,a); + + for (i = 0;i < 128;++i) { + crypto_uint32 x = cstr[32 + rq_encoderounded_len + i]; + C[2*i] = (x & 15) * 287 - 2007; + C[2*i+1] = (x >> 4) * 287 - 2007; + } + + for (i = 0;i < 256;++i) + C[i] = -(modq_freeze(C[i] - aB[i] + 4*w+1) >> 14); + + for (i = 0;i < 32;++i) r[i] = 0; + for (i = 0;i < 256;++i) + r[i / 8] |= (C[i] << (i & 7)); + +#ifdef KAT + { + int j; + printf("decrypt r: "); + for (j = 0;j < 32;++j) + printf("%02x",255 & (int) r[j]); + printf("\n"); + } +#endif + + hide(checkcstr,maybek,sk,r); + result = verify(cstr,checkcstr); + + for (i = 0;i < 32;++i) k[i] = maybek[i] & ~result; + return result; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/enc.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/enc.c new file mode 100644 index 000000000..d9790fcb9 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/enc.c @@ -0,0 +1,30 @@ +#ifdef KAT +#include +#endif + +#include "hide.h" +#include "randombytes.h" +#include "crypto_kem.h" + +int crypto_kem_enc( + unsigned char *cstr, + unsigned char *k, + const unsigned char *pk +) +{ + unsigned char r[32]; + randombytes(r,32); + +#ifdef KAT + { + int i; + printf("encrypt r: "); + for (i = 0;i < 32;++i) + printf("%02x",255 & (int) r[i]); + printf("\n"); + } +#endif + + hide(cstr,k,pk,r); + return 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/hide.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/hide.c new file mode 100644 index 000000000..83ef2bae8 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/hide.c @@ -0,0 +1,49 @@ +#include +#include +#include "crypto_hash_sha512.h" +#include "crypto_kem.h" +#include "params.h" +#include "rq.h" +#include "hide.h" + +#if 
crypto_kem_CIPHERTEXTBYTES != rq_encoderounded_len + 32 + 128 +#error "crypto_kem_CIPHERTEXTBYTES must match rq_encoderounded_len + 32 + 128" +#endif + +void hide(unsigned char *cstr,unsigned char *k,const unsigned char *pk,const unsigned char *r) +{ + modq G[p]; + modq A[p]; + unsigned char k12[64]; + unsigned char k34[64]; + small b[p]; + modq B[p]; + modq C[p]; + int i; + + rq_fromseed(G,pk); + rq_decoderounded(A,pk + 32); + + crypto_hash_sha512(k12,r,32); + small_seeded_weightw(b,k12); + crypto_hash_sha512(k34,k12 + 32,32); + + rq_mult(B,G,b); + rq_round3(B,B); + + rq_mult(C,A,b); + for (i = 0;i < 256;++i) { + modq x = C[i]; + x = modq_sum(x,2295 * (1 & (r[i / 8] >> (i & 7)))); + x = ((x + 2156) * 114 + 16384) >> 15; + C[i] = x; /* between 0 and 15 */ + } + + memcpy(cstr,k34,32); cstr += 32; + memcpy(k,k34 + 32,32); + + rq_encoderounded(cstr,B); cstr += rq_encoderounded_len; + + for (i = 0;i < 128;++i) + *cstr++ = C[2*i] + (C[2*i + 1] << 4); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/hide.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/hide.h new file mode 100644 index 000000000..989c1a0f9 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/hide.h @@ -0,0 +1,9 @@ +#ifndef hide_h +#define hide_h + +#include "crypto_int32.h" + +#define hide crypto_kem_ntrulpr4591761_ref_hide +extern void hide(unsigned char *,unsigned char *,const unsigned char *,const unsigned char *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/implementors b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/implementors new file mode 100644 index 000000000..51ac31ea2 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/implementors @@ -0,0 +1,5 @@ +Alphabetical order: +Daniel J. 
Bernstein +Chitchanok Chuengsatiansup +Tanja Lange +Christine van Vredendaal diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/int32_sort.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/int32_sort.c new file mode 100644 index 000000000..f24441108 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/int32_sort.c @@ -0,0 +1,35 @@ +#include "int32_sort.h" +#include "crypto_uint32.h" + +static void minmax(crypto_int32 *x,crypto_int32 *y) +{ + crypto_uint32 xi = *x; + crypto_uint32 yi = *y; + crypto_uint32 xy = xi ^ yi; + crypto_uint32 c = yi - xi; + c ^= xy & (c ^ yi); + c >>= 31; + c = -c; + c &= xy; + *x = xi ^ c; + *y = yi ^ c; +} + +void int32_sort(crypto_int32 *x,int n) +{ + int top,p,q,i; + + if (n < 2) return; + top = 1; + while (top < n - top) top += top; + + for (p = top;p > 0;p >>= 1) { + for (i = 0;i < n - p;++i) + if (!(i & p)) + minmax(x + i,x + i + p); + for (q = top;q > p;q >>= 1) + for (i = 0;i < n - q;++i) + if (!(i & p)) + minmax(x + i + p,x + i + q); + } +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/int32_sort.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/int32_sort.h new file mode 100644 index 000000000..51c5751d6 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef int32_sort_h +#define int32_sort_h + +#include "crypto_int32.h" + +#define int32_sort crypto_kem_ntrulpr4591761_ref_int32_sort +extern void int32_sort(crypto_int32 *,int); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/keypair.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/keypair.c new file mode 100644 index 000000000..310973391 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/keypair.c @@ -0,0 +1,39 @@ +#include +#include "modq.h" +#include "params.h" +#include "small.h" +#include "rq.h" +#include "crypto_kem.h" +#include "randombytes.h" +#include 
"crypto_stream_aes256ctr.h" + +#if crypto_kem_PUBLICKEYBYTES != rq_encoderounded_len + 32 +#error "crypto_kem_PUBLICKEYBYTES must match rq_encoderounded_len + 32" +#endif +#if crypto_kem_SECRETKEYBYTES != small_encode_len + crypto_kem_PUBLICKEYBYTES +#error "crypto_kem_SECRETKEYBYTES must match small_encode_len + crypto_kem_PUBLICKEYBYTES" +#endif + +int crypto_kem_keypair(unsigned char *pk,unsigned char *sk) +{ + unsigned char K[32]; + modq G[p]; + small a[p]; + modq A[p]; + + randombytes(K,32); + rq_fromseed(G,K); + + small_random_weightw(a); + + rq_mult(A,G,a); + rq_round3(A,A); + + memcpy(pk,K,32); + rq_encoderounded(pk + 32,A); + + small_encode(sk,a); + memcpy(sk + small_encode_len,pk,crypto_kem_PUBLICKEYBYTES); + + return 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/modq.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/modq.h new file mode 100644 index 000000000..7e13857ea --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/modq.h @@ -0,0 +1,44 @@ +#ifndef modq_h +#define modq_h + +#include "crypto_int16.h" +#include "crypto_int32.h" +#include "crypto_uint16.h" +#include "crypto_uint32.h" + +typedef crypto_int16 modq; + +/* input between -9000000 and 9000000 */ +/* output between -2295 and 2295 */ +static inline modq modq_freeze(crypto_int32 a) +{ + a -= 4591 * ((228 * a) >> 20); + a -= 4591 * ((58470 * a + 134217728) >> 28); + return a; +} + +/* input between 0 and 4294967295 */ +/* output = (input % 4591) - 2295 */ +static inline modq modq_fromuint32(crypto_uint32 a) +{ + crypto_int32 r; + r = (a & 524287) + (a >> 19) * 914; /* <= 8010861 */ + return modq_freeze(r - 2295); +} + +static inline modq modq_plusproduct(modq a,modq b,modq c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return modq_freeze(A + B * C); +} + +static inline modq modq_sum(modq a,modq b) +{ + crypto_int32 A = a; + crypto_int32 B = b; + return modq_freeze(A + B); +} + +#endif diff --git 
a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/params.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/params.h new file mode 100644 index 000000000..8c0d74f8b --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/params.h @@ -0,0 +1,15 @@ +#ifndef params_h +#define params_h + +#define q 4591 +/* XXX: also built into modq in various ways */ + +#define qshift 2295 +#define p 761 +#define w 250 + +#define rq_encode_len 1218 +#define rq_encoderounded_len 1015 +#define small_encode_len 191 + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/randomweightw.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/randomweightw.c new file mode 100644 index 000000000..f01d96c47 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/randomweightw.c @@ -0,0 +1,28 @@ +#include "params.h" +#include "randombytes.h" +#include "int32_sort.h" +#include "small.h" +#include "crypto_stream_aes256ctr.h" + +static const unsigned char n[16] = {0}; + +void small_seeded_weightw(small *f,const unsigned char *k) +{ + crypto_int32 r[p]; + int i; + + crypto_stream_aes256ctr((unsigned char *) r,sizeof r,n,k); + for (i = 0;i < p;++i) r[i] ^= 0x80000000; + + for (i = 0;i < w;++i) r[i] &= -2; + for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1; + int32_sort(r,p); + for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1; +} + +void small_random_weightw(small *f) +{ + unsigned char k[32]; + randombytes(k,32); + small_seeded_weightw(f,k); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq.h new file mode 100644 index 000000000..2970eeb61 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq.h @@ -0,0 +1,31 @@ +#ifndef rq_h +#define rq_h + +#include "modq.h" +#include "small.h" + +#define rq_encode crypto_kem_ntrulpr4591761_ref_rq_encode +extern void rq_encode(unsigned char *,const modq *); + +#define 
rq_decode crypto_kem_ntrulpr4591761_ref_rq_decode +extern void rq_decode(modq *,const unsigned char *); + +#define rq_encoderounded crypto_kem_ntrulpr4591761_ref_rq_encoderounded +extern void rq_encoderounded(unsigned char *,const modq *); + +#define rq_decoderounded crypto_kem_ntrulpr4591761_ref_rq_decoderounded +extern void rq_decoderounded(modq *,const unsigned char *); + +#define rq_round3 crypto_kem_ntrulpr4591761_ref_rq_round +extern void rq_round3(modq *,const modq *); + +#define rq_mult crypto_kem_ntrulpr4591761_ref_rq_mult +extern void rq_mult(modq *,const modq *,const small *); + +#define rq_recip3 crypto_kem_ntrulpr4591761_ref_rq_recip3 +int rq_recip3(modq *,const small *); + +#define rq_fromseed crypto_kem_ntrulpr4591761_ref_rq_fromseed +extern void rq_fromseed(modq *,const unsigned char *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_fromseed.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_fromseed.c new file mode 100644 index 000000000..c78b37417 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_fromseed.c @@ -0,0 +1,15 @@ +#include "crypto_stream_aes256ctr.h" +#include "rq.h" +#include "params.h" + +static const unsigned char n[16] = {0}; + +void rq_fromseed(modq *h,const unsigned char *K) +{ + crypto_uint32 buf[p]; + int i; + + crypto_stream_aes256ctr((unsigned char *) buf,sizeof buf,n,K); + for (i = 0;i < p;++i) + h[i] = modq_fromuint32(buf[i]); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_mult.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_mult.c new file mode 100644 index 000000000..86dc7da03 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_mult.c @@ -0,0 +1,30 @@ +#include "params.h" +#include "rq.h" + +void rq_mult(modq *h,const modq *f,const small *g) +{ + modq fg[p + p - 1]; + modq result; + int i, j; + + for (i = 0;i < p;++i) { + result = 0; + for (j = 0;j <= i;++j) + result = 
modq_plusproduct(result,f[j],g[i - j]); + fg[i] = result; + } + for (i = p;i < p + p - 1;++i) { + result = 0; + for (j = i - p + 1;j < p;++j) + result = modq_plusproduct(result,f[j],g[i - j]); + fg[i] = result; + } + + for (i = p + p - 2;i >= p;--i) { + fg[i - p] = modq_sum(fg[i - p],fg[i]); + fg[i - p + 1] = modq_sum(fg[i - p + 1],fg[i]); + } + + for (i = 0;i < p;++i) + h[i] = fg[i]; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_round3.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_round3.c new file mode 100644 index 000000000..c972e8e4e --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_round3.c @@ -0,0 +1,10 @@ +#include "params.h" +#include "rq.h" + +void rq_round3(modq *h,const modq *f) +{ + int i; + + for (i = 0;i < p;++i) + h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_rounded.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_rounded.c new file mode 100644 index 000000000..04c75f324 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/rq_rounded.c @@ -0,0 +1,101 @@ +#include "params.h" +#include "crypto_uint32.h" +#include "rq.h" + +void rq_encoderounded(unsigned char *c,const modq *f) +{ + crypto_int32 f0, f1, f2; + int i; + + for (i = 0;i < p/3;++i) { + f0 = *f++ + qshift; + f1 = *f++ + qshift; + f2 = *f++ + qshift; + f0 = (21846 * f0) >> 16; + f1 = (21846 * f1) >> 16; + f2 = (21846 * f2) >> 16; + /* now want f0 + f1*1536 + f2*1536^2 as a 32-bit integer */ + f2 *= 3; + f1 += f2 << 9; + f1 *= 3; + f0 += f1 << 9; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; + } + /* XXX: using p mod 3 = 2 */ + f0 = *f++ + qshift; + f1 = *f++ + qshift; + f0 = (21846 * f0) >> 16; + f1 = (21846 * f1) >> 16; + f1 *= 3; + f0 += f1 << 9; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; +} + +void rq_decoderounded(modq *f,const unsigned char 
*c) +{ + crypto_uint32 c0, c1, c2, c3; + crypto_uint32 f0, f1, f2; + int i; + + for (i = 0;i < p/3;++i) { + c0 = *c++; + c1 = *c++; + c2 = *c++; + c3 = *c++; + + /* f0 + f1*1536 + f2*1536^2 */ + /* = c0 + c1*256 + c2*256^2 + c3*256^3 */ + /* with each f between 0 and 1530 */ + + /* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */ + /* claim: 2^21 f2 < x < 2^21(f2+1) */ + /* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */ + /* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */ + /* at least 456 - (8/9)255 - (2/9)255 > 0 */ + /* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */ + f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21; + + c2 += c3 << 8; + c2 -= (f2 * 9) << 2; + /* f0 + f1*1536 */ + /* = c0 + c1*256 + c2*256^2 */ + /* c2 <= 35 = floor((1530+1530*1536)/256^2) */ + /* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */ + /* claim: 2^21 f1 < x < 2^21(f1+1) */ + /* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */ + /* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */ + /* at least 1365 - (1/3)35 - (1/3)255 - (1/3)255 > 0 */ + /* at most 1365 + (4096/3)1530 < 2^21 */ + f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21; + + c1 += c2 << 8; + c1 -= (f1 * 3) << 1; + + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 * 3 + q - qshift); + *f++ = modq_freeze(f1 * 3 + q - qshift); + *f++ = modq_freeze(f2 * 3 + q - qshift); + } + + c0 = *c++; + c1 = *c++; + c2 = *c++; + + f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21; + + c1 += c2 << 8; + c1 -= (f1 * 3) << 1; + + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 * 3 + q - qshift); + *f++ = modq_freeze(f1 * 3 + q - qshift); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/small.c b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/small.c new file mode 100644 index 000000000..270dcbe28 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/small.c @@ -0,0 +1,37 @@ 
+#include "params.h" +#include "small.h" + +/* XXX: these functions rely on p mod 4 = 1 */ + +/* all coefficients in -1, 0, 1 */ +void small_encode(unsigned char *c,const small *f) +{ + small c0; + int i; + + for (i = 0;i < p/4;++i) { + c0 = *f++ + 1; + c0 += (*f++ + 1) << 2; + c0 += (*f++ + 1) << 4; + c0 += (*f++ + 1) << 6; + *c++ = c0; + } + c0 = *f++ + 1; + *c++ = c0; +} + +void small_decode(small *f,const unsigned char *c) +{ + unsigned char c0; + int i; + + for (i = 0;i < p/4;++i) { + c0 = *c++; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; + } + c0 = *c++; + *f++ = ((small) (c0 & 3)) - 1; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/small.h b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/small.h new file mode 100644 index 000000000..664ab8593 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/ntrulpr4591761/ref/small.h @@ -0,0 +1,27 @@ +#ifndef small_h +#define small_h + +#include "crypto_int8.h" +#include "crypto_int32.h" + +typedef crypto_int8 small; + +#define small_encode crypto_kem_ntrulpr4591761_ref_small_encode +extern void small_encode(unsigned char *,const small *); + +#define small_decode crypto_kem_ntrulpr4591761_ref_small_decode +extern void small_decode(small *,const unsigned char *); + +#define small_random32 crypto_kem_ntrulpr4591761_ref_small_random32 +extern crypto_int32 small_random32(void); + +#define small_random crypto_kem_ntrulpr4591761_ref_small_random +extern void small_random(small *); + +#define small_seeded_weightw crypto_kem_ntrulpr4591761_ref_small_seeded_weightw +extern void small_seeded_weightw(small *,const unsigned char *); + +#define small_random_weightw crypto_kem_ntrulpr4591761_ref_small_random_weightw +extern void small_random_weightw(small *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/api.h 
b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/api.h new file mode 100644 index 000000000..94d75538b --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/api.h @@ -0,0 +1,4 @@ +#define CRYPTO_SECRETKEYBYTES 1600 +#define CRYPTO_PUBLICKEYBYTES 1218 +#define CRYPTO_CIPHERTEXTBYTES 1047 +#define CRYPTO_BYTES 32 diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/dec.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/dec.c new file mode 100644 index 000000000..2c3226d50 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/dec.c @@ -0,0 +1,67 @@ +#ifdef KAT +#include +#endif + +#include "params.h" +#include "small.h" +#include "mod3.h" +#include "rq.h" +#include "r3.h" +#include "crypto_hash_sha512.h" +#include "crypto_verify_32.h" +#include "crypto_kem.h" + +int crypto_kem_dec( + unsigned char *k, + const unsigned char *cstr, + const unsigned char *sk +) +{ + small f[768]; + modq h[768]; + small grecip[768]; + modq c[768]; + modq t[768]; + small t3[768]; + small r[768]; + modq hr[768]; + unsigned char rstr[small_encode_len]; + unsigned char hash[64]; + int i; + int result = 0; + + small_decode(f,sk); + small_decode(grecip,sk + small_encode_len); + rq_decode(h,sk + 2 * small_encode_len); + + rq_decoderounded(c,cstr + 32); + + rq_mult(t,c,f); + rq_mod3(t3,t); + + r3_mult(r,t3,grecip); + +#ifdef KAT + { + int j; + printf("decrypt r:"); + for (j = 0;j < p;++j) + if (r[j] == 1) printf(" +%d",j); + else if (r[j] == -1) printf(" -%d",j); + printf("\n"); + } +#endif + + result |= r3_weightw_mask(r); + + rq_mult(hr,h,r); + rq_round3(hr,hr); + for (i = 0;i < p;++i) result |= modq_nonzero_mask(hr[i] - c[i]); + + small_encode(rstr,r); + crypto_hash_sha512(hash,rstr,sizeof rstr); + result |= crypto_verify_32(hash,cstr); + + for (i = 0;i < 32;++i) k[i] = (hash[32 + i] & ~result); + return result; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/enc.c 
b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/enc.c new file mode 100644 index 000000000..9ed4b99df --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/enc.c @@ -0,0 +1,48 @@ +#ifdef KAT +#include +#endif + +#include +#include "params.h" +#include "small.h" +#include "rq.h" +#include "crypto_hash_sha512.h" +#include "crypto_kem.h" + +int crypto_kem_enc( + unsigned char *cstr, + unsigned char *k, + const unsigned char *pk +) +{ + small r[768]; + modq h[768]; + modq c[768]; + unsigned char rstr[small_encode_len]; + unsigned char hash[64]; + + small_random_weightw(r); + +#ifdef KAT + { + int i; + printf("encrypt r:"); + for (i = 0;i < p;++i) + if (r[i] == 1) printf(" +%d",i); + else if (r[i] == -1) printf(" -%d",i); + printf("\n"); + } +#endif + + small_encode(rstr,r); + crypto_hash_sha512(hash,rstr,sizeof rstr); + + rq_decode(h,pk); + rq_mult(c,h,r); + + memcpy(k,hash + 32,32); + memcpy(cstr,hash,32); + rq_roundencode(cstr + 32,c); + + return 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/implementors b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/implementors new file mode 100644 index 000000000..51ac31ea2 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/implementors @@ -0,0 +1,5 @@ +Alphabetical order: +Daniel J. 
Bernstein +Chitchanok Chuengsatiansup +Tanja Lange +Christine van Vredendaal diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/int32_sort.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/int32_sort.c new file mode 100644 index 000000000..e950efe6a --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/int32_sort.c @@ -0,0 +1,425 @@ +#include "int32_sort.h" +#include + +typedef crypto_int32 int32; + +static inline void minmax(int32 *x,int32 *y) +{ + asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg %%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)" + : : "r"(x),"r"(y) : "%eax","%ebx","%edx"); +} + +/* sort x0,x2; sort x1,x3; ... sort x13, x15 */ +static inline void minmax02through1315(int32 *x) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_loadu_si256((__m256i *) (x + 8)); + __m256i c = _mm256_unpacklo_epi64(a,b); /* a01b01a45b45 */ + __m256i d = _mm256_unpackhi_epi64(a,b); /* a23b23a67b67 */ + __m256i g = _mm256_min_epi32(c,d); + __m256i h = _mm256_max_epi32(c,d); + a = _mm256_unpacklo_epi64(g,h); + b = _mm256_unpackhi_epi64(g,h); + _mm256_storeu_si256((__m256i *) x,a); + _mm256_storeu_si256((__m256i *) (x + 8),b); +} + +/* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */ +static inline void minmax02134657(int32 *x) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_shuffle_epi32(a,0x4e); + __m256i c = _mm256_cmpgt_epi32(a,b); + c = _mm256_shuffle_epi32(c,0x44); + __m256i abc = c & (a ^ b); + a ^= abc; + _mm256_storeu_si256((__m256i *) x,a); +} + +static void multiminmax2plus2( + int32 *x, + int n) +{ + while (n >= 16) { + minmax02through1315(x); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax02134657(x); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + 2); + minmax(x + 1,x + 3); + n -= 4; + x += 4; + } + if (n > 0) { + minmax(x,x + 2); + if (n > 1) minmax(x + 1,x + 3); + } +} + +static void multiminmax2plus6( + int32 *x, 
+ int n) +{ + while (n >= 4) { + minmax(x,x + 6); + minmax(x + 1,x + 7); + n -= 4; + x += 4; + } + if (n > 0) { + minmax(x,x + 6); + if (n > 1) minmax(x + 1,x + 7); + } +} + +static void multiminmax2plus14( + int32 *x, + int n) +{ + while (n >= 8) { + minmax(x,x + 14); + minmax(x + 1,x + 15); + minmax(x + 4,x + 18); + minmax(x + 5,x + 19); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + 14); + minmax(x + 1,x + 15); + n -= 4; + x += 4; + } + if (n > 0) { + minmax(x,x + 14); + if (n > 1) minmax(x + 1,x + 15); + } +} + +/* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */ +/* all of x0...x15 and y0...y15 must exist; no aliasing */ +static inline void minmax0145891213(int32 *x,int32 *y) +{ + __m256i a01234567 = _mm256_loadu_si256((__m256i *) x); + __m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8)); + __m256i b01234567 = _mm256_loadu_si256((__m256i *) y); + __m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8)); + + __m256i a0189451213 = _mm256_unpacklo_epi64(a01234567,a89101112131415); + __m256i b0189451213 = _mm256_unpacklo_epi64(b01234567,b89101112131415); + __m256i c0189451213 = _mm256_min_epi32(a0189451213,b0189451213); + __m256i d0189451213 = _mm256_max_epi32(a0189451213,b0189451213); + + __m256i c01234567 = _mm256_blend_epi32(a01234567,c0189451213,0x33); + __m256i d01234567 = _mm256_blend_epi32(b01234567,d0189451213,0x33); + __m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213,a89101112131415); + __m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213,b89101112131415); + + _mm256_storeu_si256((__m256i *) x,c01234567); + _mm256_storeu_si256((__m256i *) (x + 8),c89101112131415); + _mm256_storeu_si256((__m256i *) y,d01234567); + _mm256_storeu_si256((__m256i *) (y + 8),d89101112131415); +} + +/* offset >= 30 */ +static void multiminmax2plusmore( + int32 *x, + int n, + int offset) +{ + while (n >= 16) { + minmax0145891213(x,x + offset); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax(x,x + offset); + minmax(x + 1,x + 1 + 
offset); + minmax(x + 4,x + 4 + offset); + minmax(x + 5,x + 5 + offset); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + offset); + minmax(x + 1,x + 1 + offset); + n -= 4; + x += 4; + } + if (n > 0) { + minmax(x,x + offset); + if (n > 1) minmax(x + 1,x + 1 + offset); + } +} + +/* sort x0,x1; ... sort x14, x15 */ +static inline void minmax01through1415(int32 *x) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_loadu_si256((__m256i *) (x + 8)); + __m256i c = _mm256_unpacklo_epi32(a,b); /* ab0ab1ab4ab5 */ + __m256i d = _mm256_unpackhi_epi32(a,b); /* ab2ab3ab6ab7 */ + __m256i e = _mm256_unpacklo_epi32(c,d); /* a02b02a46b46 */ + __m256i f = _mm256_unpackhi_epi32(c,d); /* a13b13a57b57 */ + __m256i g = _mm256_min_epi32(e,f); /* a02b02a46b46 */ + __m256i h = _mm256_max_epi32(e,f); /* a13b13a57b57 */ + a = _mm256_unpacklo_epi32(g,h); + b = _mm256_unpackhi_epi32(g,h); + _mm256_storeu_si256((__m256i *) x,a); + _mm256_storeu_si256((__m256i *) (x + 8),b); +} + +/* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */ +static inline void minmax01234567(int32 *x) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_shuffle_epi32(a,0xb1); + __m256i c = _mm256_cmpgt_epi32(a,b); + c = _mm256_shuffle_epi32(c,0xa0); + __m256i abc = c & (a ^ b); + a ^= abc; + _mm256_storeu_si256((__m256i *) x,a); +} + +static void multiminmax1plus1( + int32 *x, + int n) +{ + while (n >= 16) { + minmax01through1415(x); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax01234567(x); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + 1); + minmax(x + 2,x + 3); + n -= 4; + x += 4; + } + if (n >= 2) { + minmax(x,x + 1); + n -= 2; + x += 2; + } + if (n > 0) + minmax(x,x + 1); +} + +static void multiminmax1( + int32 *x, + int n, + int offset) +{ + while (n >= 16) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + minmax(x + 4,x + 4 + offset); + minmax(x + 6,x + 6 + offset); + minmax(x + 8,x + 8 + offset); + minmax(x + 10,x + 10 + offset); + minmax(x + 
12,x + 12 + offset); + minmax(x + 14,x + 14 + offset); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + minmax(x + 4,x + 4 + offset); + minmax(x + 6,x + 6 + offset); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + n -= 4; + x += 4; + } + if (n >= 2) { + minmax(x,x + offset); + n -= 2; + x += 2; + } + if (n > 0) + minmax(x,x + offset); +} + +/* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */ +/* all of x0...x15 and y0...y15 must exist; no aliasing */ +static inline void minmax02468101214(int32 *x,int32 *y) +{ + __m256i a01234567 = _mm256_loadu_si256((__m256i *) x); + __m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8)); + __m256i b01234567 = _mm256_loadu_si256((__m256i *) y); + __m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8)); + + __m256i a0819412513 = _mm256_unpacklo_epi32(a01234567,a89101112131415); + __m256i a210311614715 = _mm256_unpackhi_epi32(a01234567,a89101112131415); + __m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513,a210311614715); + __m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513,a210311614715); + + __m256i b0819412513 = _mm256_unpacklo_epi32(b01234567,b89101112131415); + __m256i b210311614715 = _mm256_unpackhi_epi32(b01234567,b89101112131415); + __m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513,b210311614715); + __m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513,b210311614715); + + __m256i c02810461214 = _mm256_min_epi32(a02810461214,b02810461214); + __m256i d02810461214 = _mm256_max_epi32(a02810461214,b02810461214); + + __m256i c01234567 = _mm256_unpacklo_epi32(c02810461214,a13911571315); + __m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214,a13911571315); + __m256i d01234567 = _mm256_unpacklo_epi32(d02810461214,b13911571315); + __m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214,b13911571315); + + _mm256_storeu_si256((__m256i *) x,c01234567); + _mm256_storeu_si256((__m256i *) (x 
+ 8),c89101112131415); + _mm256_storeu_si256((__m256i *) y,d01234567); + _mm256_storeu_si256((__m256i *) (y + 8),d89101112131415); +} + +/* assumes offset >= 31 */ +static void multiminmax1plusmore( + int32 *x, + int n, + int offset) +{ + while (n >= 16) { + minmax02468101214(x,x + offset); + n -= 16; + x += 16; + } + if (n >= 8) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + minmax(x + 4,x + 4 + offset); + minmax(x + 6,x + 6 + offset); + n -= 8; + x += 8; + } + if (n >= 4) { + minmax(x,x + offset); + minmax(x + 2,x + 2 + offset); + n -= 4; + x += 4; + } + if (n >= 2) { + minmax(x,x + offset); + n -= 2; + x += 2; + } + if (n > 0) + minmax(x,x + offset); +} + +/* sort x0,y0; sort x1,y1; ...; sort x7,y7 */ +static inline void minmax8(int32 *x,int32 *y) +{ + __m256i a = _mm256_loadu_si256((__m256i *) x); + __m256i b = _mm256_loadu_si256((__m256i *) y); + _mm256_storeu_si256((__m256i *) x,_mm256_min_epi32(a,b)); + _mm256_storeu_si256((__m256i *) y,_mm256_max_epi32(a,b)); +} + +/* assumes p >= 8; implies offset >= 8 */ +static void multiminmax_atleast8(int p, + int32 *x, + int n, + int offset) +{ + int i; + while (n >= 2 * p) { + for (i = 0;i < p;i += 8) + minmax8(x + i,x + i + offset); + n -= 2 * p; + x += 2 * p; + } + for (i = 0;i + 8 <= n;i += 8) { + if (i & p) return; + minmax8(x + i,x + i + offset); + } + for (;i < n;++i) { + if (i & p) return; + minmax(x + i,x + i + offset); + } +} + +/* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */ +static inline void minmax4(int32 *x,int32 *y) +{ + __m128i a = _mm_loadu_si128((__m128i *) x); + __m128i b = _mm_loadu_si128((__m128i *) y); + _mm_storeu_si128((__m128i *) x,_mm_min_epi32(a,b)); + _mm_storeu_si128((__m128i *) y,_mm_max_epi32(a,b)); +} + +static void multiminmax4( + int32 *x, + int n, + int offset) +{ + int i; + while (n >= 8) { + minmax4(x,x + offset); + n -= 8; + x += 8; + } + if (n >= 4) + minmax4(x,x + offset); + else + for (i = 0;i < n;++i) + minmax(x + i,x + i + offset); +} + +void 
int32_sort(int32 *x,int n) +{ + int top,p,q; + + if (n < 2) return; + top = 1; + while (top < n - top) top += top; + + for (p = top;p >= 8;p >>= 1) { + multiminmax_atleast8(p,x,n - p,p); + for (q = top;q > p;q >>= 1) + multiminmax_atleast8(p,x + p,n - q,q - p); + } + if (p >= 4) { + multiminmax4(x,n - 4,4); + for (q = top;q > 4;q >>= 1) + multiminmax4(x + 4,n - q,q - 4); + } + if (p >= 2) { + multiminmax2plus2(x,n - 2); + for (q = top;q >= 32;q >>= 1) + multiminmax2plusmore(x + 2,n - q,q - 2); + if (q >= 16) + multiminmax2plus14(x + 2,n - 16); + if (q >= 8) + multiminmax2plus6(x + 2,n - 8); + if (q >= 4) + multiminmax2plus2(x + 2,n - 4); + } + multiminmax1plus1(x,n - 1); + for (q = top;q >= 32;q >>= 1) + multiminmax1plusmore(x + 1,n - q,q - 1); + if (q >= 16) + multiminmax1(x + 1,n - 16,15); + if (q >= 8) + multiminmax1(x + 1,n - 8,7); + if (q >= 4) + multiminmax1(x + 1,n - 4,3); + if (q >= 2) + multiminmax1plus1(x + 1,n - 2); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/int32_sort.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/int32_sort.h new file mode 100644 index 000000000..b23824178 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef int32_sort_h +#define int32_sort_h + +#include "crypto_int32.h" + +#define int32_sort crypto_kem_sntrup4591761_avx_int32_sort +extern void int32_sort(crypto_int32 *,int); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/keypair.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/keypair.c new file mode 100644 index 000000000..52517dc5c --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/keypair.c @@ -0,0 +1,39 @@ +#include +#include "modq.h" +#include "params.h" +#include "r3.h" +#include "small.h" +#include "rq.h" +#include "crypto_kem.h" + +#if crypto_kem_PUBLICKEYBYTES != rq_encode_len +#error "crypto_kem_PUBLICKEYBYTES must match rq_encode_len" +#endif +#if 
crypto_kem_SECRETKEYBYTES != rq_encode_len + 2 * small_encode_len +#error "crypto_kem_SECRETKEYBYTES must match rq_encode_len + 2 * small_encode_len" +#endif + +int crypto_kem_keypair(unsigned char *pk,unsigned char *sk) +{ + small g[768]; + small grecip[768]; + small f[768]; + modq f3recip[768]; + modq h[768]; + + do + small_random(g); + while (r3_recip(grecip,g) != 0); + + small_random_weightw(f); + rq_recip3(f3recip,f); + + rq_mult(h,f3recip,g); + + rq_encode(pk,h); + small_encode(sk,f); + small_encode(sk + small_encode_len,grecip); + memcpy(sk + 2 * small_encode_len,pk,rq_encode_len); + + return 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/mod3.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/mod3.h new file mode 100644 index 000000000..c51f2edd9 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/mod3.h @@ -0,0 +1,60 @@ +#ifndef mod3_h +#define mod3_h + +#include "small.h" +#include "crypto_int32.h" + +/* -1 if x is nonzero, 0 otherwise */ +static inline int mod3_nonzero_mask(small x) +{ + return -x*x; +} + +/* input between -100000 and 100000 */ +/* output between -1 and 1 */ +static inline small mod3_freeze(crypto_int32 a) +{ + a -= 3 * ((10923 * a) >> 15); + a -= 3 * ((89478485 * a + 134217728) >> 28); + return a; +} + +static inline small mod3_minusproduct(small a,small b,small c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return mod3_freeze(A - B * C); +} + +static inline small mod3_plusproduct(small a,small b,small c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return mod3_freeze(A + B * C); +} + +static inline small mod3_product(small a,small b) +{ + return a * b; +} + +static inline small mod3_sum(small a,small b) +{ + crypto_int32 A = a; + crypto_int32 B = b; + return mod3_freeze(A + B); +} + +static inline small mod3_reciprocal(small a1) +{ + return a1; +} + +static inline small mod3_quotient(small num,small den) +{ + return 
mod3_product(num,mod3_reciprocal(den)); +} + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/modq.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/modq.h new file mode 100644 index 000000000..c6c7ed398 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/modq.h @@ -0,0 +1,91 @@ +#ifndef modq_h +#define modq_h + +#include "crypto_int16.h" +#include "crypto_int32.h" +#include "crypto_uint16.h" + +typedef crypto_int16 modq; + +/* input between -9000000 and 9000000 */ +/* output between -2295 and 2295 */ +static inline modq modq_freeze(crypto_int32 a) +{ + a -= 4591 * ((228 * a) >> 20); + a -= 4591 * ((58470 * a + 134217728) >> 28); + return a; +} + +static inline modq modq_minusproduct(modq a,modq b,modq c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return modq_freeze(A - B * C); +} + +static inline modq modq_plusproduct(modq a,modq b,modq c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return modq_freeze(A + B * C); +} + +static inline modq modq_product(modq a,modq b) +{ + crypto_int32 A = a; + crypto_int32 B = b; + return modq_freeze(A * B); +} + +static inline modq modq_square(modq a) +{ + crypto_int32 A = a; + return modq_freeze(A * A); +} + +static inline modq modq_sum(modq a,modq b) +{ + crypto_int32 A = a; + crypto_int32 B = b; + return modq_freeze(A + B); +} + +static inline modq modq_reciprocal(modq a1) +{ + modq a2 = modq_square(a1); + modq a3 = modq_product(a2,a1); + modq a4 = modq_square(a2); + modq a8 = modq_square(a4); + modq a16 = modq_square(a8); + modq a32 = modq_square(a16); + modq a35 = modq_product(a32,a3); + modq a70 = modq_square(a35); + modq a140 = modq_square(a70); + modq a143 = modq_product(a140,a3); + modq a286 = modq_square(a143); + modq a572 = modq_square(a286); + modq a1144 = modq_square(a572); + modq a1147 = modq_product(a1144,a3); + modq a2294 = modq_square(a1147); + modq a4588 = modq_square(a2294); + modq a4589 = 
modq_product(a4588,a1); + return a4589; +} + +static inline modq modq_quotient(modq num,modq den) +{ + return modq_product(num,modq_reciprocal(den)); +} + +/* -1 if x is nonzero, 0 otherwise */ +static inline int modq_nonzero_mask(modq x) +{ + crypto_int32 r = (crypto_uint16) x; + r = -r; + r >>= 30; + return r; +} + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/mult.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/mult.c new file mode 100644 index 000000000..58d77e382 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/mult.c @@ -0,0 +1,762 @@ +#include +#include +#include "mod3.h" +#include "rq.h" +#include "r3.h" + +#define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + h0 += f0 * gj; \ + _mm256_storeu_ps(&h[i + j],h0); \ + h1 += f1 * gj; \ + h2 += f2 * gj; \ + h3 += f3 * gj; \ + h4 += f4 * gj; \ + h0 = _mm256_loadu_ps(&h[i + j + 5]); \ + h0 += f5 * gj; + +#define MULSTEP_asm(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + __asm__( \ + "vfmadd231ps %5,%6,%0 \n\t" \ + "vmovups %0,%12 \n\t" \ + "vmovups %13,%0 \n\t" \ + "vfmadd231ps %5,%7,%1 \n\t" \ + "vfmadd231ps %5,%8,%2 \n\t" \ + "vfmadd231ps %5,%9,%3 \n\t" \ + "vfmadd231ps %5,%10,%4 \n\t" \ + "vfmadd231ps %5,%11,%0 \n\t" \ + : "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \ + : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5])); + +#define MULSTEP MULSTEP_asm + +#define MULSTEP_noload(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + __asm__( \ + "vfmadd231ps %5,%6,%0 \n\t" \ + "vmovups %0,%12 \n\t" \ + "vfmadd231ps %5,%7,%1 \n\t" \ + "vfmadd231ps %5,%8,%2 \n\t" \ + "vfmadd231ps %5,%9,%3 \n\t" \ + "vfmadd231ps %5,%10,%4 \n\t" \ + "vmulps %5,%11,%0 \n\t" \ + : "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \ + : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j])); + +#define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \ + gj = g[j]; \ + __asm__( \ + "vmulps %5,%6,%0 \n\t" \ + "vmovups %0,%12 \n\t" \ + "vmulps %5,%7,%1 \n\t" \ + 
"vmulps %5,%8,%2 \n\t" \ + "vmulps %5,%9,%3 \n\t" \ + "vmulps %5,%10,%4 \n\t" \ + "vmulps %5,%11,%0 \n\t" \ + : "=&x"(h0),"=&x"(h1),"=&x"(h2),"=&x"(h3),"=&x"(h4) \ + : "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j])); + +static inline __m128i _mm_load_cvtepi8_epi16(const long long *x) +{ + __m128i result; + __asm__("vpmovsxbw %1, %0" : "=x"(result) : "m"(*x)); + return result; +} + +#define v0 _mm256_set1_epi32(0) +#define v0_128 _mm_set1_epi32(0) +#define v7 _mm256_set1_epi16(7) +#define v4591_16 _mm256_set1_epi16(4591) +#define v2296_16 _mm256_set1_epi16(2296) + +#define alpha_32 _mm256_set1_epi32(0x4b400000) +#define alpha_32_128 _mm_set1_epi32(0x4b400000) +#define alpha_float _mm256_set1_ps(12582912.0) + +#define v0_float _mm256_set1_ps(0) +#define v1_float _mm256_set1_ps(1) +#define vm1_float _mm256_set1_ps(-1) +#define vm4591_float _mm256_set1_ps(-4591) +#define recip4591_float _mm256_set1_ps(0.00021781746896101067305597908952297974298) + +static inline __m256 add(__m256 x,__m256 y) +{ + return x + y; +} + +static inline __m256 fastadd(__m256 x,__m256 y) +{ + return _mm256_fmadd_ps(y,v1_float,x); +} + +static inline __m256 fastsub(__m256 x,__m256 y) +{ + return _mm256_fmadd_ps(y,vm1_float,x); +} + +static inline __m256 reduce(__m256 x) +{ + __m256 q = x * recip4591_float; + q = _mm256_round_ps(q,8); + return _mm256_fmadd_ps(q,vm4591_float,x); +} + +static inline __m256i squeeze(__m256i x) +{ + __m256i q = _mm256_mulhrs_epi16(x,v7); + q = _mm256_mullo_epi16(q,v4591_16); + return _mm256_sub_epi16(x,q); +} + +static inline __m256i squeezeadd16(__m256i x,__m256i y) +{ + __m256i q; + x = _mm256_add_epi16(x,y); + q = _mm256_mulhrs_epi16(x,v7); + q = _mm256_mullo_epi16(q,v4591_16); + return _mm256_sub_epi16(x,q); +} + +static inline __m256i freeze(__m256i x) +{ + __m256i mask, x2296, x4591; + x4591 = _mm256_add_epi16(x,v4591_16); + mask = _mm256_srai_epi16(x,15); + x = _mm256_blendv_epi8(x,x4591,mask); + x2296 = _mm256_sub_epi16(x,v2296_16); + 
mask = _mm256_srai_epi16(x2296,15); + x4591 = _mm256_sub_epi16(x,v4591_16); + x = _mm256_blendv_epi8(x4591,x,mask); + return x; +} + +/* 24*8*float32 f inputs between -10000 and 10000 */ +/* 24*8*float32 g inputs between -32 and 32 */ +/* 48*8*float32 h outputs between -7680000 and 7680000 */ +static void mult24x8_float(__m256 h[48],const __m256 f[24],const __m256 g[24]) +{ + int i, j; + __m256 f0, f1, f2, f3, f4, f5, gj, h0, h1, h2, h3, h4; + + i = 0; + f0 = f[i]; + f1 = f[i + 1]; + f2 = f[i + 2]; + f3 = f[i + 3]; + f4 = f[i + 4]; + f5 = f[i + 5]; + MULSTEP_fromzero(0,h0,h1,h2,h3,h4) + for (j = 0;j < 20;j += 5) { + MULSTEP_noload(j + 1,h1,h2,h3,h4,h0) + MULSTEP_noload(j + 2,h2,h3,h4,h0,h1) + MULSTEP_noload(j + 3,h3,h4,h0,h1,h2) + MULSTEP_noload(j + 4,h4,h0,h1,h2,h3) + MULSTEP_noload(j + 5,h0,h1,h2,h3,h4) + } + MULSTEP_noload(j + 1,h1,h2,h3,h4,h0) + MULSTEP_noload(j + 2,h2,h3,h4,h0,h1) + MULSTEP_noload(j + 3,h3,h4,h0,h1,h2) + h[i + j + 4] = h4; + h[i + j + 5] = h0; + h[i + j + 6] = h1; + h[i + j + 7] = h2; + h[i + j + 8] = h3; + + for (i = 6;i < 24;i += 6) { + f0 = f[i]; + f1 = f[i + 1]; + f2 = f[i + 2]; + f3 = f[i + 3]; + f4 = f[i + 4]; + f5 = f[i + 5]; + h0 = h[i]; + h1 = h[i + 1]; + h2 = h[i + 2]; + h3 = h[i + 3]; + h4 = h[i + 4]; + for (j = 0;j < 15;j += 5) { + MULSTEP(j + 0,h0,h1,h2,h3,h4) + MULSTEP(j + 1,h1,h2,h3,h4,h0) + MULSTEP(j + 2,h2,h3,h4,h0,h1) + MULSTEP(j + 3,h3,h4,h0,h1,h2) + MULSTEP(j + 4,h4,h0,h1,h2,h3) + } + MULSTEP(j + 0,h0,h1,h2,h3,h4) + MULSTEP(j + 1,h1,h2,h3,h4,h0) + MULSTEP(j + 2,h2,h3,h4,h0,h1) + MULSTEP_noload(j + 3,h3,h4,h0,h1,h2) + MULSTEP_noload(j + 4,h4,h0,h1,h2,h3) + MULSTEP_noload(j + 5,h0,h1,h2,h3,h4) + MULSTEP_noload(j + 6,h1,h2,h3,h4,h0) + MULSTEP_noload(j + 7,h2,h3,h4,h0,h1) + MULSTEP_noload(j + 8,h3,h4,h0,h1,h2) + h[i + j + 9] = h4; + h[i + j + 10] = h0; + h[i + j + 11] = h1; + h[i + j + 12] = h2; + h[i + j + 13] = h3; + } + + h[47] = v0_float; +} + +/* 48*8*float32 f inputs between -5000 and 5000 */ +/* 48*8*float32 g inputs 
between -16 and 16 */ +/* 96*8*float32 h outputs between -3840000 and 3840000 */ +static void mult48x8_float(__m256 h[96],const __m256 f[48],const __m256 g[48]) +{ + __m256 h01[48]; + __m256 g01[24]; + __m256 *f01 = h01 + 24; + int i; + + for (i = 24;i > 0;) { + i -= 2; + f01[i] = f[i] + f[i + 24]; + g01[i] = g[i] + g[i + 24]; + f01[i + 1] = f[i + 1] + f[i + 1 + 24]; + g01[i + 1] = g[i + 1] + g[i + 1 + 24]; + } + + mult24x8_float(h,f,g); + mult24x8_float(h + 48,f + 24,g + 24); + mult24x8_float(h01,f01,g01); + + for (i = 0;i < 24;++i) { + __m256 h0i = h[i]; + __m256 h0itop = h[i + 24]; + __m256 h1i = h[i + 48]; + __m256 h1itop = h[i + 72]; + __m256 h01i = h01[i]; + __m256 h01itop = h01[i + 24]; + __m256 c = fastsub(h0itop,h1i); + h[i + 24] = c + fastsub(h01i,h0i); + h[i + 48] = fastsub(h01itop,h1itop) - c; + } +} + +/* 96*8*float32 f inputs between -2500 and 2500 */ +/* 96*8*float32 g inputs between -8 and 8 */ +/* 192*8*float32 h outputs between -1920000 and 1920000 */ +static void mult96x8_float(__m256 h[192],const __m256 f[96],const __m256 g[96]) +{ + __m256 h01[96]; + __m256 g01[48]; + __m256 *f01 = h01 + 48; + int i; + + for (i = 48;i > 0;) { + i -= 4; + f01[i] = f[i] + f[i + 48]; + g01[i] = g[i] + g[i + 48]; + f01[i + 1] = f[i + 1] + f[i + 1 + 48]; + g01[i + 1] = g[i + 1] + g[i + 1 + 48]; + f01[i + 2] = f[i + 2] + f[i + 2 + 48]; + g01[i + 2] = g[i + 2] + g[i + 2 + 48]; + f01[i + 3] = f[i + 3] + f[i + 3 + 48]; + g01[i + 3] = g[i + 3] + g[i + 3 + 48]; + } + + mult48x8_float(h,f,g); + mult48x8_float(h + 96,f + 48,g + 48); + mult48x8_float(h01,f01,g01); + + for (i = 0;i < 48;++i) { + __m256 h0i = h[i]; + __m256 h0itop = h[i + 48]; + __m256 h1i = h[i + 96]; + __m256 h1itop = h[i + 144]; + __m256 h01i = h01[i]; + __m256 h01itop = h01[i + 48]; + __m256 c = fastsub(h0itop,h1i); + h[i + 48] = c + fastsub(h01i,h0i); + h[i + 96] = fastsub(h01itop,h1itop) - c; + } +} + +/* 96*16*int16 f inputs between -2500 and 2500 */ +/* 96*(16*int8 stored in 32*int8) g inputs between 
-8 and 8 */ +/* 192*16*int16 h outputs between -2400 and 2400 */ +static void mult96x16(__m256i h[192],const __m256i f[96],const __m256i g[96]) +{ + __m256 hfloat[192]; + __m256 gfloat[96]; + __m256 *ffloat = hfloat + 96; + int i, p; + + for (p = 0;p < 2;++p) { + for (i = 96;i > 0;) { + i -= 2; + __m256i fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i])); + __m256i gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i])); + __m256 storage; + *(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32); + ffloat[i] = storage - alpha_float; + *(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32); + gfloat[i] = storage - alpha_float; + fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i + 1])); + gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i + 1])); + *(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32); + ffloat[i + 1] = storage - alpha_float; + *(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32); + gfloat[i + 1] = storage - alpha_float; + } + mult96x8_float(hfloat,ffloat,gfloat); + for (i = 192;i > 0;) { + __m128i h0, h1; + i -= 4; + hfloat[i] = add(alpha_float,reduce(hfloat[i])); + hfloat[i + 1] = fastadd(alpha_float,reduce(hfloat[i + 1])); + hfloat[i + 2] = add(alpha_float,reduce(hfloat[i + 2])); + hfloat[i + 3] = fastadd(alpha_float,reduce(hfloat[i + 3])); + h0 = 0[(__m128i *) &hfloat[i]]; h0 = _mm_sub_epi32(h0,alpha_32_128); + h1 = 1[(__m128i *) &hfloat[i]]; h1 = _mm_sub_epi32(h1,alpha_32_128); + _mm_storeu_si128(p + (__m128i *) &h[i],_mm_packs_epi32(h0,h1)); + h0 = 0[(__m128i *) &hfloat[i + 1]]; h0 = _mm_sub_epi32(h0,alpha_32_128); + h1 = 1[(__m128i *) &hfloat[i + 1]]; h1 = _mm_sub_epi32(h1,alpha_32_128); + _mm_storeu_si128(p + (__m128i *) &h[i + 1],_mm_packs_epi32(h0,h1)); + h0 = 0[(__m128i *) &hfloat[i + 2]]; h0 = _mm_sub_epi32(h0,alpha_32_128); + h1 = 1[(__m128i *) &hfloat[i + 2]]; h1 = _mm_sub_epi32(h1,alpha_32_128); + _mm_storeu_si128(p + (__m128i *) &h[i + 
2],_mm_packs_epi32(h0,h1)); + h0 = 0[(__m128i *) &hfloat[i + 3]]; h0 = _mm_sub_epi32(h0,alpha_32_128); + h1 = 1[(__m128i *) &hfloat[i + 3]]; h1 = _mm_sub_epi32(h1,alpha_32_128); + _mm_storeu_si128(p + (__m128i *) &h[i + 3],_mm_packs_epi32(h0,h1)); + } + } +} + +/* int16 i of output x[j] is int16 j of input x[i] */ +static void transpose16(__m256i x[16]) +{ + const static int rev[4] = {0,4,2,6}; + int i; + __m256i y[16]; + + for (i = 0;i < 16;i += 4) { + __m256i a0 = x[i]; + __m256i a1 = x[i + 1]; + __m256i a2 = x[i + 2]; + __m256i a3 = x[i + 3]; + __m256i b0 = _mm256_unpacklo_epi16(a0,a1); + __m256i b1 = _mm256_unpackhi_epi16(a0,a1); + __m256i b2 = _mm256_unpacklo_epi16(a2,a3); + __m256i b3 = _mm256_unpackhi_epi16(a2,a3); + __m256i c0 = _mm256_unpacklo_epi32(b0,b2); + __m256i c2 = _mm256_unpackhi_epi32(b0,b2); + __m256i c1 = _mm256_unpacklo_epi32(b1,b3); + __m256i c3 = _mm256_unpackhi_epi32(b1,b3); + y[i] = c0; + y[i + 2] = c2; + y[i + 1] = c1; + y[i + 3] = c3; + } + for (i = 0;i < 4;++i) { + int r = rev[i]; + __m256i c0 = y[i]; + __m256i c4 = y[i + 4]; + __m256i c8 = y[i + 8]; + __m256i c12 = y[i + 12]; + __m256i d0 = _mm256_unpacklo_epi64(c0,c4); + __m256i d4 = _mm256_unpackhi_epi64(c0,c4); + __m256i d8 = _mm256_unpacklo_epi64(c8,c12); + __m256i d12 = _mm256_unpackhi_epi64(c8,c12); + __m256i e0 = _mm256_permute2x128_si256(d0,d8,0x20); + __m256i e8 = _mm256_permute2x128_si256(d0,d8,0x31); + __m256i e4 = _mm256_permute2x128_si256(d4,d12,0x20); + __m256i e12 = _mm256_permute2x128_si256(d4,d12,0x31); + x[r] = e0; + x[r + 8] = e8; + x[r + 1] = e4; + x[r + 9] = e12; + } +} + +/* byte i of output x[j] is byte j of input x[i] */ +static void transpose32(__m256i x[32]) +{ + const static int rev[4] = {0,8,4,12}; + int i; + __m256i y[32]; + + for (i = 0;i < 32;i += 4) { + __m256i a0 = x[i]; + __m256i a1 = x[i + 1]; + __m256i a2 = x[i + 2]; + __m256i a3 = x[i + 3]; + __m256i b0 = _mm256_unpacklo_epi8(a0,a1); + __m256i b1 = _mm256_unpackhi_epi8(a0,a1); + __m256i b2 = 
_mm256_unpacklo_epi8(a2,a3); + __m256i b3 = _mm256_unpackhi_epi8(a2,a3); + __m256i c0 = _mm256_unpacklo_epi16(b0,b2); + __m256i c2 = _mm256_unpackhi_epi16(b0,b2); + __m256i c1 = _mm256_unpacklo_epi16(b1,b3); + __m256i c3 = _mm256_unpackhi_epi16(b1,b3); + y[i] = c0; + y[i + 2] = c2; + y[i + 1] = c1; + y[i + 3] = c3; + } + for (i = 0;i < 4;++i) { + int r = rev[i]; + __m256i c0 = y[i]; + __m256i c8 = y[i + 8]; + __m256i c16 = y[i + 16]; + __m256i c24 = y[i + 24]; + __m256i c4 = y[i + 4]; + __m256i c12 = y[i + 12]; + __m256i c20 = y[i + 20]; + __m256i c28 = y[i + 28]; + __m256i d0 = _mm256_unpacklo_epi32(c0,c4); + __m256i d4 = _mm256_unpackhi_epi32(c0,c4); + __m256i d8 = _mm256_unpacklo_epi32(c8,c12); + __m256i d12 = _mm256_unpackhi_epi32(c8,c12); + __m256i d16 = _mm256_unpacklo_epi32(c16,c20); + __m256i d20 = _mm256_unpackhi_epi32(c16,c20); + __m256i d24 = _mm256_unpacklo_epi32(c24,c28); + __m256i d28 = _mm256_unpackhi_epi32(c24,c28); + __m256i e0 = _mm256_unpacklo_epi64(d0,d8); + __m256i e8 = _mm256_unpackhi_epi64(d0,d8); + __m256i e16 = _mm256_unpacklo_epi64(d16,d24); + __m256i e24 = _mm256_unpackhi_epi64(d16,d24); + __m256i e4 = _mm256_unpacklo_epi64(d4,d12); + __m256i e12 = _mm256_unpackhi_epi64(d4,d12); + __m256i e20 = _mm256_unpacklo_epi64(d20,d28); + __m256i e28 = _mm256_unpackhi_epi64(d20,d28); + __m256i f0 = _mm256_permute2x128_si256(e0,e16,0x20); + __m256i f16 = _mm256_permute2x128_si256(e0,e16,0x31); + __m256i f8 = _mm256_permute2x128_si256(e8,e24,0x20); + __m256i f24 = _mm256_permute2x128_si256(e8,e24,0x31); + __m256i f4 = _mm256_permute2x128_si256(e4,e20,0x20); + __m256i f20 = _mm256_permute2x128_si256(e4,e20,0x31); + __m256i f12 = _mm256_permute2x128_si256(e12,e28,0x20); + __m256i f28 = _mm256_permute2x128_si256(e12,e28,0x31); + x[r] = f0; + x[r + 16] = f16; + x[r + 1] = f8; + x[r + 17] = f24; + x[r + 2] = f4; + x[r + 18] = f20; + x[r + 3] = f12; + x[r + 19] = f28; + } +} + +/* 48*16*int16 f inputs between -2295 and 2295 */ +/* 24*32*int8 g inputs 
between -1 and 1 */ +/* 96*16*int16 h outputs between -2295 and 2295 */ +static void mult768_mix2_m256i(__m256i h[96],const __m256i f[48],const __m256i g[24]) +{ + __m256i hkara[24][16]; + __m256i gkara[3][32]; +#define fkara hkara + int i; + + for (i = 6;i-- > 0;) { + __m256i f0, f1, f2, f3, f4, f5, f6, f7; + __m256i f01, f23, f45, f67; + __m256i f02, f46, f04, f26, f0426; + __m256i f13, f57, f15, f37, f1537; + __m256i f0213, f4657, f04261537, f0415, f2637; + + f0 = _mm256_loadu_si256(&f[i + 0]); + f1 = _mm256_loadu_si256(&f[i + 6]); + f2 = _mm256_loadu_si256(&f[i + 12]); + f3 = _mm256_loadu_si256(&f[i + 18]); + f4 = _mm256_loadu_si256(&f[i + 24]); + f5 = _mm256_loadu_si256(&f[i + 30]); + f6 = _mm256_loadu_si256(&f[i + 36]); + f7 = _mm256_loadu_si256(&f[i + 42]); + f01 = squeezeadd16(f0,f1); fkara[i][8] = f01; + f23 = squeezeadd16(f2,f3); fkara[i][9] = f23; + f45 = squeezeadd16(f4,f5); fkara[i][10] = f45; + f67 = squeezeadd16(f6,f7); fkara[i][11] = f67; + + fkara[i][0] = f0; + fkara[i][2] = f2; + fkara[i][4] = f4; + fkara[i][6] = f6; + + f02 = squeezeadd16(f0,f2); fkara[i + 6][0] = f02; + f04 = squeezeadd16(f0,f4); fkara[i + 6][6] = f04; + f46 = squeezeadd16(f4,f6); fkara[i + 6][3] = f46; + f26 = squeezeadd16(f2,f6); fkara[i + 6][8] = f26; + + fkara[i][1] = f1; + fkara[i][3] = f3; + fkara[i][5] = f5; + fkara[i][7] = f7; + + f13 = squeezeadd16(f1,f3); fkara[i + 6][1] = f13; + f15 = squeezeadd16(f1,f5); fkara[i + 6][7] = f15; + f57 = squeezeadd16(f5,f7); fkara[i + 6][4] = f57; + f37 = squeezeadd16(f3,f7); fkara[i + 6][9] = f37; + + f0426 = squeezeadd16(f04,f26); fkara[i + 6][12] = f0426; + f1537 = squeezeadd16(f15,f37); fkara[i + 6][13] = f1537; + f0213 = squeezeadd16(f02,f13); fkara[i + 6][2] = f0213; + f4657 = squeezeadd16(f46,f57); fkara[i + 6][5] = f4657; + f0415 = squeezeadd16(f04,f15); fkara[i + 6][10] = f0415; + f2637 = squeezeadd16(f26,f37); fkara[i + 6][11] = f2637; + f04261537 = squeezeadd16(f0426,f1537); fkara[i + 6][14] = f04261537; + + fkara[i][12] = 
v0; + fkara[i][13] = v0; + fkara[i][14] = v0; + fkara[i][15] = v0; + fkara[i + 6][15] = v0; + } + + for (i = 3;i-- > 0;) { + __m256i g0, g1, g2, g3, g4, g5, g6, g7; + __m256i g01, g23, g45, g67; + __m256i g02, g46, g04, g26, g0426; + __m256i g13, g57, g15, g37, g1537; + __m256i g0213, g4657, g04261537, g0415, g2637; + + g0 = _mm256_loadu_si256(&g[i + 0]); + g1 = _mm256_loadu_si256(&g[i + 3]); + g2 = _mm256_loadu_si256(&g[i + 6]); + g3 = _mm256_loadu_si256(&g[i + 9]); + g4 = _mm256_loadu_si256(&g[i + 12]); + g5 = _mm256_loadu_si256(&g[i + 15]); + g6 = _mm256_loadu_si256(&g[i + 18]); + g7 = _mm256_loadu_si256(&g[i + 21]); + g01 = _mm256_add_epi8(g0,g1); gkara[i][8] = g01; + g23 = _mm256_add_epi8(g2,g3); gkara[i][9] = g23; + g45 = _mm256_add_epi8(g4,g5); gkara[i][10] = g45; + g67 = _mm256_add_epi8(g6,g7); gkara[i][11] = g67; + + gkara[i][0] = g0; + gkara[i][2] = g2; + gkara[i][4] = g4; + gkara[i][6] = g6; + + g02 = _mm256_add_epi8(g0,g2); gkara[i][16] = g02; + g04 = _mm256_add_epi8(g0,g4); gkara[i][22] = g04; + g46 = _mm256_add_epi8(g4,g6); gkara[i][19] = g46; + g26 = _mm256_add_epi8(g2,g6); gkara[i][24] = g26; + + gkara[i][1] = g1; + gkara[i][3] = g3; + gkara[i][5] = g5; + gkara[i][7] = g7; + + g13 = _mm256_add_epi8(g1,g3); gkara[i][17] = g13; + g15 = _mm256_add_epi8(g1,g5); gkara[i][23] = g15; + g57 = _mm256_add_epi8(g5,g7); gkara[i][20] = g57; + g37 = _mm256_add_epi8(g3,g7); gkara[i][25] = g37; + + g0426 = _mm256_add_epi8(g04,g26); gkara[i][28] = g0426; + g1537 = _mm256_add_epi8(g15,g37); gkara[i][29] = g1537; + g0213 = _mm256_add_epi8(g02,g13); gkara[i][18] = g0213; + g4657 = _mm256_add_epi8(g46,g57); gkara[i][21] = g4657; + g0415 = _mm256_add_epi8(g04,g15); gkara[i][26] = g0415; + g2637 = _mm256_add_epi8(g26,g37); gkara[i][27] = g2637; + g04261537 = _mm256_add_epi8(g0426,g1537); gkara[i][30] = g04261537; + + gkara[i][12] = v0; + gkara[i][13] = v0; + gkara[i][14] = v0; + gkara[i][15] = v0; + gkara[i][31] = v0; + } + + for (i = 12;i-- > 0;) + transpose16(fkara[i]); 
+ for (i = 3;i-- > 0;) + transpose32(gkara[i]); + + mult96x16(hkara[12],fkara[6],(__m256i *) (1 + (__m128i *) gkara)); + mult96x16(hkara[0],fkara[0],gkara[0]); + + for (i = 24;i-- > 0;) + transpose16(hkara[i]); + + for (i = 6;i-- > 0;) { + __m256i h0,h1,h2,h3,h4,h5,h6,h7,h8,h9; + __m256i h10,h11,h12,h13,h14,h15,h16,h17,h18,h19; + __m256i h20,h21,h22,h23; + __m256i h32,h33,h34,h35,h36,h37,h38,h39; + __m256i h40,h41,h42,h43,h44,h45,h46,h47,h48,h49; + __m256i h50,h51,h52,h53,h54,h55,h56,h57,h58,h59; + __m256i h60,h61; + __m256i c; + +#define COMBINE(h0,h1,h2,h3,x0,x1) \ + c = _mm256_sub_epi16(h1,h2); \ + h1 = _mm256_sub_epi16(_mm256_add_epi16(c,x0),h0); \ + h2 = _mm256_sub_epi16(x1,_mm256_add_epi16(c,h3)); \ + h1 = squeeze(h1); \ + h2 = squeeze(h2); + + h56 = hkara[i + 12][12]; + h57 = hkara[i + 18][12]; + h58 = hkara[i + 12][13]; + h59 = hkara[i + 18][13]; + h60 = hkara[i + 12][14]; + h61 = hkara[i + 18][14]; + COMBINE(h56,h57,h58,h59,h60,h61) + + h44 = hkara[i + 12][6]; + h45 = hkara[i + 18][6]; + h46 = hkara[i + 12][7]; + h47 = hkara[i + 18][7]; + h52 = hkara[i + 12][10]; + h53 = hkara[i + 18][10]; + COMBINE(h44,h45,h46,h47,h52,h53) + + h48 = hkara[i + 12][8]; + h49 = hkara[i + 18][8]; + h50 = hkara[i + 12][9]; + h51 = hkara[i + 18][9]; + h54 = hkara[i + 12][11]; + h55 = hkara[i + 18][11]; + COMBINE(h48,h49,h50,h51,h54,h55) + COMBINE(h44,h46,h48,h50,h56,h58) + COMBINE(h45,h47,h49,h51,h57,h59) + + h0 = hkara[i][0]; + h1 = hkara[i + 6][0]; + h2 = hkara[i][1]; + h3 = hkara[i + 6][1]; + h16 = hkara[i][8]; + h17 = hkara[i + 6][8]; + COMBINE(h0,h1,h2,h3,h16,h17) + + h4 = hkara[i][2]; + h5 = hkara[i + 6][2]; + h6 = hkara[i][3]; + h7 = hkara[i + 6][3]; + h18 = hkara[i][9]; + h19 = hkara[i + 6][9]; + COMBINE(h4,h5,h6,h7,h18,h19) + + h32 = hkara[i + 12][0]; + h33 = hkara[i + 18][0]; + h34 = hkara[i + 12][1]; + h35 = hkara[i + 18][1]; + h36 = hkara[i + 12][2]; + h37 = hkara[i + 18][2]; + COMBINE(h32,h33,h34,h35,h36,h37) + COMBINE(h1,h3,h5,h7,h33,h35) + 
COMBINE(h0,h2,h4,h6,h32,h34) + + h8 = hkara[i][4]; + h9 = hkara[i + 6][4]; + h10 = hkara[i][5]; + h11 = hkara[i + 6][5]; + h20 = hkara[i][10]; + h21 = hkara[i + 6][10]; + COMBINE(h8,h9,h10,h11,h20,h21) + + h12 = hkara[i][6]; + h13 = hkara[i + 6][6]; + h14 = hkara[i][7]; + h15 = hkara[i + 6][7]; + h22 = hkara[i][11]; + h23 = hkara[i + 6][11]; + COMBINE(h12,h13,h14,h15,h22,h23) + + h38 = hkara[i + 12][3]; + h39 = hkara[i + 18][3]; + h40 = hkara[i + 12][4]; + h41 = hkara[i + 18][4]; + h42 = hkara[i + 12][5]; + h43 = hkara[i + 18][5]; + COMBINE(h38,h39,h40,h41,h42,h43) + COMBINE(h8,h10,h12,h14,h38,h40) + COMBINE(h9,h11,h13,h15,h39,h41) + + COMBINE(h0,h4,h8,h12,h44,h48) + h0 = freeze(h0); + h4 = freeze(h4); + h8 = freeze(h8); + h12 = freeze(h12); + _mm256_storeu_si256(&h[i + 0],h0); + _mm256_storeu_si256(&h[i + 24],h4); + _mm256_storeu_si256(&h[i + 48],h8); + _mm256_storeu_si256(&h[i + 72],h12); + + COMBINE(h1,h5,h9,h13,h45,h49) + h1 = freeze(h1); + h5 = freeze(h5); + h9 = freeze(h9); + h13 = freeze(h13); + _mm256_storeu_si256(&h[i + 6],h1); + _mm256_storeu_si256(&h[i + 30],h5); + _mm256_storeu_si256(&h[i + 54],h9); + _mm256_storeu_si256(&h[i + 78],h13); + + COMBINE(h2,h6,h10,h14,h46,h50) + h2 = freeze(h2); + h6 = freeze(h6); + h10 = freeze(h10); + h14 = freeze(h14); + _mm256_storeu_si256(&h[i + 12],h2); + _mm256_storeu_si256(&h[i + 36],h6); + _mm256_storeu_si256(&h[i + 60],h10); + _mm256_storeu_si256(&h[i + 84],h14); + + COMBINE(h3,h7,h11,h15,h47,h51) + h3 = freeze(h3); + h7 = freeze(h7); + h11 = freeze(h11); + h15 = freeze(h15); + _mm256_storeu_si256(&h[i + 18],h3); + _mm256_storeu_si256(&h[i + 42],h7); + _mm256_storeu_si256(&h[i + 66],h11); + _mm256_storeu_si256(&h[i + 90],h15); + } +} + +#define p 761 + +/* 761 f inputs between -2295 and 2295 */ +/* 761 g inputs between -1 and 1 */ +/* 761 h outputs between -2295 and 2295 */ +void rq_mult(modq *h,const modq *f,const small *g) +{ + __m256i fgvec[96]; + modq *fg; + int i; + + mult768_mix2_m256i(fgvec,(__m256i *) 
f,(__m256i *) g); + fg = (modq *) fgvec; + + h[0] = modq_freeze(fg[0] + fg[p]); + for (i = 1;i < 9;++i) + h[i] = modq_freeze(fg[i] + fg[i + p - 1] + fg[i + p]); + for (i = 9;i < 761;i += 16) { + __m256i fgi = _mm256_loadu_si256((__m256i *) &fg[i]); + __m256i fgip = _mm256_loadu_si256((__m256i *) &fg[i + p]); + __m256i fgip1 = _mm256_loadu_si256((__m256i *) &fg[i + p - 1]); + __m256i x = _mm256_add_epi16(fgi,_mm256_add_epi16(fgip,fgip1)); + x = freeze(squeeze(x)); + _mm256_storeu_si256((__m256i *) &h[i],x); + } + for (i = 761;i < 768;++i) + h[i] = 0; +} + +void r3_mult(small *h,const small *f,const small *g) +{ + __m256i fgvec[96]; + __m256i fvec[48]; + modq *fg; + int i; + + memset(fvec,0,sizeof fvec); + + for (i = 0;i < 761;++i) + i[(modq *) fvec] = f[i]; + + mult768_mix2_m256i(fgvec,fvec,(__m256i *) g); + fg = (modq *) fgvec; + + h[0] = mod3_freeze(fg[0] + fg[p]); + for (i = 1;i < p;++i) + h[i] = mod3_freeze(fg[i] + fg[i + p - 1] + fg[i + p]); + for (i = p;i < 768;++i) + h[i] = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/params.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/params.h new file mode 100644 index 000000000..655e6ec09 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/params.h @@ -0,0 +1,14 @@ +#ifndef params_h +#define params_h + +#define q 4591 +/* XXX: also built into modq in various ways */ + +#define qshift 2295 +#define p 761 +#define w 286 + +#define rq_encode_len 1218 +#define small_encode_len 191 + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/r3.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/r3.h new file mode 100644 index 000000000..c2f2e5e27 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/r3.h @@ -0,0 +1,15 @@ +#ifndef r3_h +#define r3_h + +#include "small.h" + +#define r3_mult crypto_kem_sntrup4591761_avx_r3_mult +extern void r3_mult(small *,const small *,const small *); + +#define r3_recip 
crypto_kem_sntrup4591761_avx_r3_recip +extern int r3_recip(small *,const small *); + +#define r3_weightw_mask crypto_kem_sntrup4591761_avx_r3_weightw_mask +extern int r3_weightw_mask(const small *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/r3_recip.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/r3_recip.c new file mode 100644 index 000000000..6c8526803 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/r3_recip.c @@ -0,0 +1,192 @@ +#include +#include "params.h" +#include "mod3.h" +#include "swap.h" +#include "r3.h" + +/* caller must ensure that x-y does not overflow */ +static int smaller_mask(int x,int y) +{ + return (x - y) >> 31; +} + +static void vectormod3_product(small *z,int len,const small *x,const small c) +{ + int i; + int minusmask = c; + int plusmask = -c; + __m256i minusvec, plusvec, zerovec; + + minusmask >>= 31; + plusmask >>= 31; + minusvec = _mm256_set1_epi32(minusmask); + plusvec = _mm256_set1_epi32(plusmask); + zerovec = _mm256_set1_epi32(0); + + while (len >= 32) { + __m256i xi = _mm256_loadu_si256((__m256i *) x); + xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec,xi) & minusvec); + _mm256_storeu_si256((__m256i *) z,xi); + x += 32; + z += 32; + len -= 32; + } + + for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c); +} + +static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c) +{ + int i; + int minusmask = c; + int plusmask = -c; + __m256i minusvec, plusvec, zerovec, twovec, fourvec; + + minusmask >>= 31; + plusmask >>= 31; + minusvec = _mm256_set1_epi32(minusmask); + plusvec = _mm256_set1_epi32(plusmask); + zerovec = _mm256_set1_epi32(0); + twovec = _mm256_set1_epi32(0x02020202); + fourvec = _mm256_set1_epi32(0x04040404); + + while (len >= 32) { + __m256i xi = _mm256_loadu_si256((__m256i *) x); + __m256i yi = _mm256_loadu_si256((__m256i *) y); + __m256i r; + yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec,yi) & minusvec); + xi = 
_mm256_sub_epi8(xi,yi); + + r = _mm256_add_epi8(xi,twovec); + r &= fourvec; + r = _mm256_srli_epi32(r,2); + xi = _mm256_sub_epi8(xi,r); + r = _mm256_add_epi8(r,r); + xi = _mm256_sub_epi8(xi,r); + + r = _mm256_sub_epi8(twovec,xi); + r &= fourvec; + r = _mm256_srli_epi32(r,2); + xi = _mm256_add_epi8(xi,r); + r = _mm256_add_epi8(r,r); + xi = _mm256_add_epi8(xi,r); + + _mm256_storeu_si256((__m256i *) z,xi); + x += 32; + y += 32; + z += 32; + len -= 32; + } + + for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c); +} + +static void vectormod3_shift(small *z,int len) +{ + int i; + while (len >= 33) { + __m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 33)); + _mm256_storeu_si256((__m256i *) (z + len - 32),zi); + len -= 32; + } + for (i = len - 1;i > 0;--i) z[i] = z[i - 1]; + z[0] = 0; +} + +/* +r = s^(-1) mod m, returning 0, if s is invertible mod m +or returning -1 if s is not invertible mod m +r,s are polys of degree

= loops) break; + + c = mod3_quotient(g[p],f[p]); + + vectormod3_minusproduct(g,768,g,f,c); + vectormod3_shift(g,769); + +#ifdef SIMPLER + vectormod3_minusproduct(v,1536,v,u,c); + vectormod3_shift(v,1537); +#else + if (loop < p) { + vectormod3_minusproduct(v,loop + 1,v,u,c); + vectormod3_shift(v,loop + 2); + } else { + vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c); + vectormod3_shift(v + loop - p,p + 2); + } +#endif + + e -= 1; + + ++loop; + + swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]); + swap(&e,&d,sizeof e,swapmask); + swap(f,g,(p + 1) * sizeof(small),swapmask); + +#ifdef SIMPLER + swap(u,v,1536 * sizeof(small),swapmask); +#else + if (loop < p) { + swap(u,v,(loop + 1) * sizeof(small),swapmask); + } else { + swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask); + } +#endif + } + + c = mod3_reciprocal(f[p]); + vectormod3_product(r,p,u + p,c); + for (i = p;i < 768;++i) r[i] = 0; + return smaller_mask(0,d); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/randomsmall.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/randomsmall.c new file mode 100644 index 000000000..9602a9d6a --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/randomsmall.c @@ -0,0 +1,17 @@ +#include "params.h" +#include "randombytes.h" +#include "crypto_uint32.h" +#include "small.h" + +void small_random(small *g) +{ + crypto_uint32 r[p]; + int i; + + randombytes((unsigned char *) r,sizeof r); + for (i = 0;i < p;++i) + g[i] = (small) (((r[i] & 1073741823) * 3) >> 30) - 1; + /* bias is miniscule */ + for (i = p;i < 768;++i) + g[i] = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/randomweightw.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/randomweightw.c new file mode 100644 index 000000000..efa5945d5 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/randomweightw.c @@ -0,0 +1,17 @@ +#include "params.h" +#include "randombytes.h" 
+#include "int32_sort.h" +#include "small.h" + +void small_random_weightw(small *f) +{ + crypto_int32 r[p]; + int i; + + randombytes((unsigned char *) r,sizeof r); + for (i = 0;i < w;++i) r[i] &= -2; + for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1; + int32_sort(r,p); + for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1; + for (i = p;i < 768;++i) f[i] = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq.c new file mode 100644 index 000000000..33d655a8c --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq.c @@ -0,0 +1,158 @@ +#include +#include "params.h" +#include "crypto_uint32.h" +#include "crypto_int64.h" +#include "rq.h" + +#define v2295_16 _mm256_set1_epi16(2295) +#define v2295_16_128 _mm_set1_epi16(2295) +#define alpha_top _mm256_set1_epi32(0x43380000) +#define alpha _mm256_set1_pd(6755399441055744.0) +#define alpha_64 _mm256_set1_epi64(0x4338000000000000) + +/* each reciprocal is rounded _up_ to nearest floating-point number */ +#define recip54 0.0185185185185185209599811884118025773204863071441650390625 +#define recip4591 0.000217817468961010681817447309782664888189174234867095947265625 +#define recip6144 0.0001627604166666666847367028747584072334575466811656951904296875 +#define recip331776 0.00000301408179012345704632478034235010255770248477347195148468017578125 +#define recip37748736 0.000000026490953233506946282623583451172610825352649044361896812915802001953125 + +#define broadcast(r) _mm256_set1_pd(r) +#define floor(x) _mm256_floor_pd(x) + +void rq_encode(unsigned char *c,const modq *f) +{ + crypto_int32 f0, f1, f2, f3, f4; + int i; + + for (i = 0;i < p/5;++i) { + f0 = *f++ + qshift; + f1 = *f++ + qshift; + f2 = *f++ + qshift; + f3 = *f++ + qshift; + f4 = *f++ + qshift; + /* now want f0 + 6144*f1 + ... 
as a 64-bit integer */ + f1 *= 3; + f2 *= 9; + f3 *= 27; + f4 *= 81; + /* now want f0 + f1<<11 + f2<<22 + f3<<33 + f4<<44 */ + f0 += f1 << 11; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + f0 += f2 << 6; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + f0 += f3 << 1; + *c++ = f0; f0 >>= 8; + f0 += f4 << 4; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; + } + /* XXX: using p mod 5 = 1 */ + f0 = *f++ + qshift; + *c++ = f0; f0 >>= 8; + *c++ = f0; +} + +void rq_decode(modq *f,const unsigned char *c) +{ + crypto_uint32 c0, c1; + int i; + + for (i = 0;i < 152;i += 4) { + __m256i abcd, ac, bd, abcd0, abcd1; + __m256d x0, x1, f4, f3, f2, f1, f0; + __m128i if4, if3, if2, if1, if0; + __m128i x01, x23, x02, x13, xab, xcd; + + /* f0 + f1*6144 + f2*6144^2 + f3*6144^3 + f4*6144^4 */ + /* = c0 + c1*256 + ... + c6*256^6 + c7*256^7 */ + /* with each f between 0 and 4590 */ + + /* could use _mm256_cvtepi32_pd instead; but beware uint32 */ + + abcd = _mm256_loadu_si256((__m256i *) c); /* a0 a1 b0 b1 c0 c1 d0 d1 */ + c += 32; + + ac = _mm256_unpacklo_epi32(abcd,alpha_top); /* a0 a1 c0 c1 */ + bd = _mm256_unpackhi_epi32(abcd,alpha_top); /* b0 b1 d0 d1 */ + abcd1 = _mm256_unpackhi_epi64(ac,bd); /* a1 b1 c1 d1 */ + abcd0 = _mm256_unpacklo_epi64(ac,bd); /* a0 b0 c0 d0 */ + x1 = *(__m256d *) &abcd1; + x0 = *(__m256d *) &abcd0; + + x1 -= alpha; + x0 -= alpha; + + /* x1 is [0,41] + [0,4590]*54 + f4*331776 */ + f4 = broadcast(recip331776) * x1; + f4 = floor(f4); + x1 -= broadcast(331776.0) * f4; + + /* x1 is [0,41] + f3*54 */ + f3 = broadcast(recip54) * x1; + f3 = floor(f3); + x1 -= broadcast(54.0) * f3; + + x0 += broadcast(4294967296.0) * x1; + + /* x0 is [0,4590] + [0,4590]*6144 + f2*6144^2 */ + f2 = broadcast(recip37748736) * x0; + f2 = floor(f2); + x0 -= broadcast(37748736.0) * f2; + + /* x0 is [0,4590] + f1*6144 */ + f1 = broadcast(recip6144) * x0; + f1 = floor(f1); + x0 -= broadcast(6144.0) * f1; + + f0 = x0; + + f4 -= broadcast(4591.0) * floor(broadcast(recip4591) * f4); + 
f3 -= broadcast(4591.0) * floor(broadcast(recip4591) * f3); + f2 -= broadcast(4591.0) * floor(broadcast(recip4591) * f2); + f1 -= broadcast(4591.0) * floor(broadcast(recip4591) * f1); + f0 -= broadcast(4591.0) * floor(broadcast(recip4591) * f0); + + if4 = _mm256_cvtpd_epi32(f4); /* a4 0 b4 0 c4 0 d4 0 */ + if3 = _mm256_cvtpd_epi32(f3); /* a3 0 b3 0 c3 0 d3 0 */ + if2 = _mm256_cvtpd_epi32(f2); /* a2 0 b2 0 c2 0 d2 0 */ + if1 = _mm256_cvtpd_epi32(f1); /* a1 0 b1 0 c1 0 d1 0 */ + if0 = _mm256_cvtpd_epi32(f0); /* a0 0 b0 0 c0 0 d0 0 */ + + if4 = _mm_sub_epi16(if4,v2295_16_128); + f[4] = _mm_extract_epi32(if4,0); + f[9] = _mm_extract_epi32(if4,1); + f[14] = _mm_extract_epi32(if4,2); + f[19] = _mm_extract_epi32(if4,3); + + x23 = _mm_packs_epi32(if2,if3); /* a2 b2 c2 d2 a3 b3 c3 d3 */ + x01 = _mm_packs_epi32(if0,if1); /* a0 b0 c0 d0 a1 b1 c1 d1 */ + x02 = _mm_unpacklo_epi16(x01,x23); /* a0 a2 b0 b2 c0 c2 d0 d2 */ + x13 = _mm_unpackhi_epi16(x01,x23); /* a1 a3 b1 b3 c1 c3 d1 d3 */ + xab = _mm_unpacklo_epi16(x02,x13); /* a0 a1 a2 a3 b0 b1 b2 b3 */ + xcd = _mm_unpackhi_epi16(x02,x13); /* c0 c1 c2 c3 d0 d1 d2 d3 */ + xab = _mm_sub_epi16(xab,v2295_16_128); + xcd = _mm_sub_epi16(xcd,v2295_16_128); + + *(crypto_int64 *) (f + 0) = _mm_extract_epi64(xab,0); + *(crypto_int64 *) (f + 5) = _mm_extract_epi64(xab,1); + *(crypto_int64 *) (f + 10) = _mm_extract_epi64(xcd,0); + *(crypto_int64 *) (f + 15) = _mm_extract_epi64(xcd,1); + f += 20; + } + + c0 = *c++; + c1 = *c++; + c0 += c1 << 8; + *f++ = modq_freeze(c0 + q - qshift); + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq.h new file mode 100644 index 000000000..f9adfb7a3 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq.h @@ -0,0 +1,31 @@ +#ifndef rq_h +#define rq_h + +#include "modq.h" +#include "small.h" + +#define rq_encode 
crypto_kem_sntrup4591761_avx_rq_encode +extern void rq_encode(unsigned char *,const modq *); + +#define rq_decode crypto_kem_sntrup4591761_avx_rq_decode +extern void rq_decode(modq *,const unsigned char *); + +#define rq_roundencode crypto_kem_sntrup4591761_avx_rq_roundencode +extern void rq_roundencode(unsigned char *,const modq *); + +#define rq_decoderounded crypto_kem_sntrup4591761_avx_rq_decoderounded +extern void rq_decoderounded(modq *,const unsigned char *); + +#define rq_round3 crypto_kem_sntrup4591761_avx_rq_round3 +extern void rq_round3(modq *,const modq *); + +#define rq_mod3 crypto_kem_sntrup4591761_avx_rq_mod3 +extern void rq_mod3(small *,const modq *); + +#define rq_mult crypto_kem_sntrup4591761_avx_rq_mult +extern void rq_mult(modq *,const modq *,const small *); + +#define rq_recip3 crypto_kem_sntrup4591761_avx_rq_recip3 +int rq_recip3(modq *,const small *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_mod3.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_mod3.c new file mode 100644 index 000000000..5f2f2d575 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_mod3.c @@ -0,0 +1,49 @@ +#include +#include "mod3.h" +#include "rq.h" + +#define v3 _mm256_set1_epi16(3) +#define v7 _mm256_set1_epi16(7) +#define v2296_16 _mm256_set1_epi16(2296) +#define v4591_16 _mm256_set1_epi16(4591) +#define v10923_16 _mm256_set1_epi16(10923) + +static inline __m256i squeeze(__m256i x) +{ + __m256i q = _mm256_mulhrs_epi16(x,v7); + q = _mm256_mullo_epi16(q,v4591_16); + return _mm256_sub_epi16(x,q); +} + +static inline __m256i freeze(__m256i x) +{ + __m256i mask, x2296, x4591; + x4591 = _mm256_add_epi16(x,v4591_16); + mask = _mm256_srai_epi16(x,15); + x = _mm256_blendv_epi8(x,x4591,mask); + x2296 = _mm256_sub_epi16(x,v2296_16); + mask = _mm256_srai_epi16(x2296,15); + x4591 = _mm256_sub_epi16(x,v4591_16); + x = _mm256_blendv_epi8(x4591,x,mask); + return x; +} + +void rq_mod3(small *g,const modq *f) 
+{ + int i; + + for (i = 0;i < 768;i += 16) { + __m256i x = _mm256_loadu_si256((__m256i *) &f[i]); + __m256i q; + x = _mm256_mullo_epi16(x,v3); + x = squeeze(x); + x = freeze(x); + q = _mm256_mulhrs_epi16(x,v10923_16); + x = _mm256_sub_epi16(x,q); + q = _mm256_add_epi16(q,q); + x = _mm256_sub_epi16(x,q); /* g0 g1 ... g15 */ + x = _mm256_packs_epi16(x,x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */ + 0[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,0),0); + 1[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,1),0); + } +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_recip3.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_recip3.c new file mode 100644 index 000000000..5151fd398 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_recip3.c @@ -0,0 +1,215 @@ +#include +#include "params.h" +#include "swap.h" +#include "rq.h" + +#define v7 _mm256_set1_epi16(7) +#define v1827_16 _mm256_set1_epi16(1827) +#define v4591_16 _mm256_set1_epi16(4591) +#define v29234_16 _mm256_set1_epi16(29234) + +/* caller must ensure that x-y does not overflow */ +static int smaller_mask(int x,int y) +{ + return (x - y) >> 31; +} + +static inline __m256i product(__m256i x,__m256i y) +{ + __m256i lo, hi, r0, r1, t0, t1, t, s0, s1; + + lo = _mm256_mullo_epi16(x,y); + hi = _mm256_mulhi_epi16(x,y); + r0 = _mm256_unpacklo_epi16(lo,hi); + r1 = _mm256_unpackhi_epi16(lo,hi); + + t0 = _mm256_srai_epi32(r0,16); + t1 = _mm256_srai_epi32(r1,16); + t = _mm256_packs_epi32(t0,t1); + t = _mm256_mulhrs_epi16(t,v29234_16); + lo = _mm256_mullo_epi16(t,v4591_16); + hi = _mm256_mulhi_epi16(t,v4591_16); + s0 = _mm256_unpacklo_epi16(lo,hi); + s1 = _mm256_unpackhi_epi16(lo,hi); + s0 = _mm256_slli_epi32(s0,4); + s1 = _mm256_slli_epi32(s1,4); + r0 = _mm256_sub_epi32(r0,s0); + r1 = _mm256_sub_epi32(r1,s1); + + t0 = _mm256_srai_epi32(r0,8); + t1 = _mm256_srai_epi32(r1,8); + t = _mm256_packs_epi32(t0,t1); + t = 
_mm256_mulhrs_epi16(t,v1827_16); + lo = _mm256_mullo_epi16(t,v4591_16); + hi = _mm256_mulhi_epi16(t,v4591_16); + s0 = _mm256_unpacklo_epi16(lo,hi); + s1 = _mm256_unpackhi_epi16(lo,hi); + r0 = _mm256_sub_epi32(r0,s0); + r1 = _mm256_sub_epi32(r1,s1); + + x = _mm256_packs_epi32(r0,r1); + return x; +} + +static inline __m256i minusproduct(__m256i x,__m256i y,__m256i z) +{ + __m256i t; + + x = _mm256_sub_epi16(x,product(y,z)); + t = _mm256_mulhrs_epi16(x,v7); + t = _mm256_mullo_epi16(t,v4591_16); + x = _mm256_sub_epi16(x,t); + return x; +} + +static void vectormodq_product(modq *z,int len,const modq *x,const modq c) +{ + __m256i cvec = _mm256_set1_epi16(c); + while (len >= 16) { + __m256i xi = _mm256_loadu_si256((__m256i *) x); + xi = product(xi,cvec); + _mm256_storeu_si256((__m256i *) z,xi); + x += 16; + z += 16; + len -= 16; + } + while (len > 0) { + *z = modq_product(*x,c); + ++x; + ++z; + --len; + } +} + +static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c) +{ + __m256i cvec = _mm256_set1_epi16(c); + while (len >= 16) { + __m256i xi = _mm256_loadu_si256((__m256i *) x); + __m256i yi = _mm256_loadu_si256((__m256i *) y); + xi = minusproduct(xi,yi,cvec); + _mm256_storeu_si256((__m256i *) z,xi); + x += 16; + y += 16; + z += 16; + len -= 16; + } + while (len > 0) { + *z = modq_minusproduct(*x,*y,c); + ++x; + ++y; + ++z; + --len; + } +} + +static void vectormodq_shift(modq *z,int len) +{ + int i; + while (len >= 17) { + __m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 17)); + _mm256_storeu_si256((__m256i *) (z + len - 16),zi); + len -= 16; + } + for (i = len - 1;i > 0;--i) z[i] = z[i - 1]; + z[0] = 0; +} + +/* +r = (3s)^(-1) mod m, returning 0, if s is invertible mod m +or returning -1 if s is not invertible mod m +r,s are polys of degree

= loops) break; + + c = modq_quotient(g[p],f[p]); + + vectormodq_minusproduct(g,768,g,f,c); + vectormodq_shift(g,769); + +#ifdef SIMPLER + vectormodq_minusproduct(v,1536,v,u,c); + vectormodq_shift(v,1537); +#else + if (loop < p) { + vectormodq_minusproduct(v,loop + 1,v,u,c); + vectormodq_shift(v,loop + 2); + } else { + vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c); + vectormodq_shift(v + loop - p,p + 2); + } +#endif + + e -= 1; + + ++loop; + + swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]); + swap(&e,&d,sizeof e,swapmask); + swap(f,g,768 * sizeof(modq),swapmask); + +#ifdef SIMPLER + swap(u,v,1536 * sizeof(modq),swapmask); +#else + if (loop < p) { + swap(u,v,(loop + 1) * sizeof(modq),swapmask); + } else { + swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask); + } +#endif + } + + c = modq_reciprocal(f[p]); + vectormodq_product(r,p,u + p,c); + for (i = 0;i < p;++i) r[i] = modq_freeze(r[i]); + for (i = p;i < 768;++i) r[i] = 0; + return smaller_mask(0,d); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_round3.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_round3.c new file mode 100644 index 000000000..61f4decbb --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_round3.c @@ -0,0 +1,20 @@ +#include +#include "params.h" +#include "rq.h" + +#define v3_16 _mm256_set1_epi16(3) +#define v10923_16 _mm256_set1_epi16(10923) + +void rq_round3(modq *h,const modq *f) +{ + int i; + + for (i = 0;i < 768;i += 16) { + __m256i x = _mm256_loadu_si256((__m256i *) &f[i]); + __m256i x2; + x = _mm256_mulhrs_epi16(x,v10923_16); + x2 = _mm256_add_epi16(x,x); + x = _mm256_add_epi16(x,x2); + _mm256_storeu_si256((__m256i *) &h[i],x); + } +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_rounded.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_rounded.c new file mode 100644 index 000000000..05b674635 --- /dev/null +++ 
b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/rq_rounded.c @@ -0,0 +1,260 @@ +#include +#include "params.h" +#include "crypto_uint32.h" +#include "rq.h" + +#define alpha_top _mm256_set1_epi32(0x43380000) +#define alpha _mm256_set1_pd(6755399441055744.0) +#define v10923_16 _mm256_set1_epi16(10923) +#define floor(x) _mm256_floor_pd(x) + +void rq_roundencode(unsigned char *c,const modq *f) +{ + int i; + __m256i h[50]; + + for (i = 0;i < 208;i += 16) { + __m256i a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2; + __m256i e0, e1, f0, f1, g0, g1; + a0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[0])); + a1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[8])); + a2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *) &f[16])); + a0 = _mm256_inserti128_si256(a0,_mm_loadu_si128((__m128i *) &f[24]),1); + a1 = _mm256_inserti128_si256(a1,_mm_loadu_si128((__m128i *) &f[32]),1); + a2 = _mm256_inserti128_si256(a2,_mm_loadu_si128((__m128i *) &f[40]),1); + f += 48; + + a0 = _mm256_mulhrs_epi16(a0,v10923_16); + a1 = _mm256_mulhrs_epi16(a1,v10923_16); + a2 = _mm256_mulhrs_epi16(a2,v10923_16); + + /* a0: a0 a1 a2 b0 b1 b2 c0 c1 and similar second half */ + /* a1: c2 d0 d1 d2 e0 e1 e2 f0 */ + /* a2: f1 f2 g0 g1 g2 h0 h1 h2 */ + + b1 = _mm256_blend_epi16(a2,a0,0xf0); + b1 = _mm256_shuffle_epi32(b1,0x4e); + b0 = _mm256_blend_epi16(a0,a1,0xf0); + b2 = _mm256_blend_epi16(a1,a2,0xf0); + /* XXX: use shufps instead? 
*/ + + /* b0: a0 a1 a2 b0 e0 e1 e2 f0 */ + /* b1: b1 b2 c0 c1 f1 f2 g0 g1 */ + /* b2: c2 d0 d1 d2 g2 h0 h1 h2 */ + + c1 = _mm256_blend_epi16(b2,b0,0xcc); + c1 = _mm256_shuffle_epi32(c1,0xb1); + c0 = _mm256_blend_epi16(b0,b1,0xcc); + c2 = _mm256_blend_epi16(b1,b2,0xcc); + + /* c0: a0 a1 c0 c1 e0 e1 g0 g1 */ + /* c1: a2 b0 c2 d0 e2 f0 g2 h0 */ + /* c2: b1 b2 d1 d2 f1 f2 h1 h2 */ + + d1 = _mm256_blend_epi16(c2,c0,0xaa); + d1 = _mm256_shufflelo_epi16(d1,0xb1); + d1 = _mm256_shufflehi_epi16(d1,0xb1); + d0 = _mm256_blend_epi16(c0,c1,0xaa); + d2 = _mm256_blend_epi16(c1,c2,0xaa); + + /* d0: a0 b0 c0 d0 e0 f0 g0 h0 */ + /* d1: a1 b1 c1 d1 e1 f1 g1 h1 */ + /* d2: a2 b2 c2 d2 e2 f2 g2 h2 */ + + d0 = _mm256_add_epi16(d0,_mm256_set1_epi16(765)); + d1 = _mm256_add_epi16(d1,_mm256_set1_epi16(765)); + d2 = _mm256_add_epi16(d2,_mm256_set1_epi16(765)); + /* want bytes of d0 + 1536*d1 + 1536*1536*d2 */ + + e0 = d0 & _mm256_set1_epi16(0xff); + d0 = _mm256_srli_epi16(d0,8); + /* want e0, d0 + 6*d1 + 6*1536*d2 */ + + d1 = _mm256_mullo_epi16(d1,_mm256_set1_epi16(6)); + d0 = _mm256_add_epi16(d0,d1); + /* want e0, d0 + 6*1536*d2 */ + + e1 = _mm256_slli_epi16(d0,8); + e0 = _mm256_add_epi16(e0,e1); + d0 = _mm256_srli_epi16(d0,8); + /* want e0, d0 + 36*d2 */ + + d2 = _mm256_mullo_epi16(d2,_mm256_set1_epi16(36)); + e1 = _mm256_add_epi16(d0,d2); + /* want e0, e1 */ + + /* e0: out0 out1 out4 out5 out8 out9 ... */ + /* e1: out2 out3 out6 out7 out10 out11 ... 
*/ + + f0 = _mm256_unpacklo_epi16(e0,e1); + f1 = _mm256_unpackhi_epi16(e0,e1); + + g0 = _mm256_permute2x128_si256(f0,f1,0x20); + g1 = _mm256_permute2x128_si256(f0,f1,0x31); + + _mm256_storeu_si256((__m256i *) c,g0); + _mm256_storeu_si256((__m256i *) (c + 32),g1); + c += 64; + } + + for (i = 0;i < 9;++i) { + __m256i x = _mm256_loadu_si256((__m256i *) &f[16 * i]); + _mm256_storeu_si256(&h[i],_mm256_mulhrs_epi16(x,v10923_16)); + } + f = (const modq *) h; + + for (i = 208;i < 253;++i) { + crypto_int32 f0, f1, f2; + f0 = *f++; + f1 = *f++; + f2 = *f++; + f0 += 1806037245; + f1 *= 3; + f2 *= 9; + f0 += f1 << 9; + f0 += f2 << 18; + *(crypto_int32 *) c = f0; + c += 4; + } + { + crypto_int32 f0, f1; + f0 = *f++; + f1 = *f++; + f0 += 1175805; + f1 *= 3; + f0 += f1 << 9; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; + } +} + +void rq_decoderounded(modq *f,const unsigned char *c) +{ + crypto_uint32 c0, c1, c2, c3; + crypto_uint32 f0, f1, f2; + int i; + + for (i = 0;i < 248;i += 8) { + __m256i abcdefgh, todo[2]; + __m256d x, f2, f1, f0; + __m128i if2, if1, if0; + int j; + + abcdefgh = _mm256_loadu_si256((__m256i *) c); + c += 32; + + todo[0] = _mm256_unpacklo_epi32(abcdefgh,alpha_top); + todo[1] = _mm256_unpackhi_epi32(abcdefgh,alpha_top); + + for (j = 0;j < 2;++j) { + x = *(__m256d *) &todo[j]; + x -= alpha; + + /* x is f0 + f1*1536 + f2*1536^2 */ + /* with each f between 0 and 1530 */ + + f2 = x * _mm256_set1_pd(0.00000042385525173611114052197733521876177320564238470979034900665283203125); + f2 = floor(f2); + x -= f2 * _mm256_set1_pd(2359296.0); + + f1 = x * _mm256_set1_pd(0.00065104166666666673894681149903362893383018672466278076171875); + f1 = floor(f1); + x -= f1 * _mm256_set1_pd(1536.0); + + f0 = x; + + f2 -= _mm256_set1_pd(1531.0) * floor(f2 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875)); + f1 -= _mm256_set1_pd(1531.0) * floor(f1 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875)); + f0 -= 
_mm256_set1_pd(1531.0) * floor(f0 * _mm256_set1_pd(0.0006531678641410842804659875326933615724556148052215576171875)); + + f2 *= _mm256_set1_pd(3.0); f2 -= _mm256_set1_pd(2295.0); + f1 *= _mm256_set1_pd(3.0); f1 -= _mm256_set1_pd(2295.0); + f0 *= _mm256_set1_pd(3.0); f0 -= _mm256_set1_pd(2295.0); + + if2 = _mm256_cvtpd_epi32(f2); /* a2 b2 e2 f2 */ + if1 = _mm256_cvtpd_epi32(f1); /* a1 b1 e1 f1 */ + if0 = _mm256_cvtpd_epi32(f0); /* a0 b0 e0 f0 */ + + f[6*j + 0] = _mm_extract_epi32(if0,0); + f[6*j + 1] = _mm_extract_epi32(if1,0); + f[6*j + 2] = _mm_extract_epi32(if2,0); + f[6*j + 3] = _mm_extract_epi32(if0,1); + f[6*j + 4] = _mm_extract_epi32(if1,1); + f[6*j + 5] = _mm_extract_epi32(if2,1); + + f[6*j + 12] = _mm_extract_epi32(if0,2); + f[6*j + 13] = _mm_extract_epi32(if1,2); + f[6*j + 14] = _mm_extract_epi32(if2,2); + f[6*j + 15] = _mm_extract_epi32(if0,3); + f[6*j + 16] = _mm_extract_epi32(if1,3); + f[6*j + 17] = _mm_extract_epi32(if2,3); + } + + f += 24; + } + + for (i = 248;i < 253;++i) { + c0 = *c++; + c1 = *c++; + c2 = *c++; + c3 = *c++; + + /* f0 + f1*1536 + f2*1536^2 */ + /* = c0 + c1*256 + c2*256^2 + c3*256^3 */ + /* with each f between 0 and 1530 */ + + /* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */ + /* claim: 2^21 f2 < x < 2^21(f2+1) */ + /* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */ + /* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */ + /* at least 456 - (8/9)255 - (2/9)255 > 0 */ + /* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */ + f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21; + + c2 += c3 << 8; + c2 -= (f2 * 9) << 2; + /* f0 + f1*1536 */ + /* = c0 + c1*256 + c2*256^2 */ + /* c2 <= 35 = floor((1530+1530*1536)/256^2) */ + /* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */ + /* claim: 2^21 f1 < x < 2^21(f1+1) */ + /* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */ + /* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */ + /* at least 1365 - 
(1/3)35 - (1/3)255 - (1/3)255 > 0 */ + /* at most 1365 + (4096/3)1530 < 2^21 */ + f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21; + + c1 += c2 << 8; + c1 -= (f1 * 3) << 1; + + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 * 3 + q - qshift); + *f++ = modq_freeze(f1 * 3 + q - qshift); + *f++ = modq_freeze(f2 * 3 + q - qshift); + } + + c0 = *c++; + c1 = *c++; + c2 = *c++; + + f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21; + + c1 += c2 << 8; + c1 -= (f1 * 3) << 1; + + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 * 3 + q - qshift); + *f++ = modq_freeze(f1 * 3 + q - qshift); + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/small.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/small.c new file mode 100644 index 000000000..1d9896404 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/small.c @@ -0,0 +1,45 @@ +#include +#include "params.h" +#include "small.h" + +/* XXX: these functions rely on p mod 4 = 1 */ + +/* all coefficients in -1, 0, 1 */ +void small_encode(unsigned char *c,const small *f) +{ + small c0; + int i; + + for (i = 0;i < p/4;++i) { + c0 = *f++ + 1; + c0 += (*f++ + 1) << 2; + c0 += (*f++ + 1) << 4; + c0 += (*f++ + 1) << 6; + *c++ = c0; + } + c0 = *f++ + 1; + *c++ = c0; +} + +void small_decode(small *f,const unsigned char *c) +{ + unsigned char c0; + int i; + + for (i = 0;i < p/4;++i) { + c0 = *c++; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; + } + c0 = *c++; + *f++ = ((small) (c0 & 3)) - 1; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; + *f++ = 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/small.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/small.h new file mode 100644 index 000000000..79806d2b5 --- /dev/null +++ 
b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/small.h @@ -0,0 +1,20 @@ +#ifndef small_h +#define small_h + +#include "crypto_int8.h" + +typedef crypto_int8 small; + +#define small_encode crypto_kem_sntrup4591761_avx_small_encode +extern void small_encode(unsigned char *,const small *); + +#define small_decode crypto_kem_sntrup4591761_avx_small_decode +extern void small_decode(small *,const unsigned char *); + +#define small_random crypto_kem_sntrup4591761_avx_small_random +extern void small_random(small *); + +#define small_random_weightw crypto_kem_sntrup4591761_avx_small_random_weightw +extern void small_random_weightw(small *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/swap.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/swap.c new file mode 100644 index 000000000..a404d2988 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/swap.c @@ -0,0 +1,32 @@ +#include +#include "swap.h" + +void swap(void *x,void *y,int bytes,int mask) +{ + char c = mask; + __m256i maskvec = _mm256_set1_epi32(mask); + + while (bytes >= 32) { + __m256i xi = _mm256_loadu_si256(x); + __m256i yi = _mm256_loadu_si256(y); + __m256i xinew = _mm256_blendv_epi8(xi,yi,maskvec); + __m256i yinew = _mm256_blendv_epi8(yi,xi,maskvec); + _mm256_storeu_si256(x,xinew); + _mm256_storeu_si256(y,yinew); + x = 32 + (char *) x; + y = 32 + (char *) y; + bytes -= 32; + } + while (bytes > 0) { + char xi = *(char *) x; + char yi = *(char *) y; + char t = c & (xi ^ yi); + xi ^= t; + yi ^= t; + *(char *) x = xi; + *(char *) y = yi; + ++x; + ++y; + --bytes; + } +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/swap.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/swap.h new file mode 100644 index 000000000..dcfba2c4a --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/swap.h @@ -0,0 +1,7 @@ +#ifndef swap_h +#define swap_h + +#define swap crypto_kem_sntrup4591761_avx_swap +extern void 
swap(void *,void *,int,int); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/weight.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/weight.c new file mode 100644 index 000000000..a08383463 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/avx/weight.c @@ -0,0 +1,28 @@ +#include +#include "params.h" +#include "r3.h" +#include "crypto_uint16.h" +#include "crypto_int32.h" + +int r3_weightw_mask(const small *r) +{ + int weight; + int i; + __m256i tally = _mm256_set1_epi32(0); + + for (i = 0;i < 768;i += 16) { + __m256i x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i *) &r[i])); + x &= _mm256_set1_epi32(0x00010001); + tally = _mm256_add_epi16(tally,x); + } + + tally = _mm256_hadd_epi16(tally,tally); + tally = _mm256_hadd_epi16(tally,tally); + tally = _mm256_hadd_epi16(tally,tally); + + weight = _mm_extract_epi16(_mm256_extracti128_si256(tally,0),0) + _mm_extract_epi16(_mm256_extracti128_si256(tally,1),0); + + weight -= w; + + return (-(crypto_int32) (crypto_uint16) weight) >> 30; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/checksumbig b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/checksumbig new file mode 100644 index 000000000..a366c4e70 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/checksumbig @@ -0,0 +1 @@ +83705d49d3a8cb2e16028b86ea6bd44a969b51c2e5114ee02767cf2ddf1aac26 diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/checksumsmall b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/checksumsmall new file mode 100644 index 000000000..d87bd217b --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/checksumsmall @@ -0,0 +1 @@ +336647fe0ed2f6e0d4b15d05e68faec67a81312d769ad3cbee8e0f2de83c2dde diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/description b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/description new file mode 100644 index 000000000..7827a166d --- /dev/null +++ 
b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/description @@ -0,0 +1 @@ +Streamlined NTRU Prime 4591^761 diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/designers b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/designers new file mode 100644 index 000000000..51ac31ea2 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/designers @@ -0,0 +1,5 @@ +Alphabetical order: +Daniel J. Bernstein +Chitchanok Chuengsatiansup +Tanja Lange +Christine van Vredendaal diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/README b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/README new file mode 100644 index 000000000..30265ef71 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/README @@ -0,0 +1,32 @@ +This is a reference implementation of Streamlined NTRU Prime 4591^761. +This implementation is designed primarily for clarity, subject to the +following constraints: + + * The implementation is written in C. The Sage implementation in the + NTRU Prime paper is considerably more concise (and compatible). + + * The implementation avoids data-dependent branches and array + indices. For example, conditional swaps are computed by arithmetic + rather than by branches. + + * The implementation avoids other C operations that often take + variable time. For example, divisions by 3 are computed via + multiplications and shifts. + +This implementation does _not_ sacrifice clarity for speed. + +This implementation has not yet been reviewed for correctness or for +constant-time behavior. It does pass various tests and has no known +bugs, but there are at least some platforms where multiplications take +variable time, and fixing this requires platform-specific effort; see +https://www.bearssl.org/ctmul.html and http://repository.tue.nl/800603. + +This implementation allows "benign malleability" of ciphertexts, as +defined in http://www.shoup.net/papers/iso-2_1.pdf. 
Specifically, each +32-bit ciphertext word encodes three integers between 0 and 1530; if +larger integers appear then they are silently reduced modulo 1531. +Similar comments apply to public keys. + +There is a separate "avx" implementation where similar comments apply, +except that "avx" _does_ sacrifice clarity for speed on CPUs with AVX2 +instructions. diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/api.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/api.h new file mode 100644 index 000000000..94d75538b --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/api.h @@ -0,0 +1,4 @@ +#define CRYPTO_SECRETKEYBYTES 1600 +#define CRYPTO_PUBLICKEYBYTES 1218 +#define CRYPTO_CIPHERTEXTBYTES 1047 +#define CRYPTO_BYTES 32 diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/dec.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/dec.c new file mode 100644 index 000000000..a7cd22a65 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/dec.c @@ -0,0 +1,71 @@ +#ifdef KAT +#include +#endif + +#include "params.h" +#include "small.h" +#include "mod3.h" +#include "rq.h" +#include "r3.h" +#include "crypto_hash_sha512.h" +#include "crypto_verify_32.h" +#include "crypto_kem.h" + +int crypto_kem_dec( + unsigned char *k, + const unsigned char *cstr, + const unsigned char *sk +) +{ + small f[p]; + modq h[p]; + small grecip[p]; + modq c[p]; + modq t[p]; + small t3[p]; + small r[p]; + modq hr[p]; + unsigned char rstr[small_encode_len]; + unsigned char hash[64]; + int i; + int result = 0; + int weight; + + small_decode(f,sk); + small_decode(grecip,sk + small_encode_len); + rq_decode(h,sk + 2 * small_encode_len); + + rq_decoderounded(c,cstr + 32); + + rq_mult(t,c,f); + for (i = 0;i < p;++i) t3[i] = mod3_freeze(modq_freeze(3*t[i])); + + r3_mult(r,t3,grecip); + +#ifdef KAT + { + int j; + printf("decrypt r:"); + for (j = 0;j < p;++j) + if (r[j] == 1) printf(" +%d",j); + else if (r[j] == -1) printf(" 
-%d",j); + printf("\n"); + } +#endif + + weight = 0; + for (i = 0;i < p;++i) weight += (1 & r[i]); + weight -= w; + result |= modq_nonzero_mask(weight); /* XXX: puts limit on p */ + + rq_mult(hr,h,r); + rq_round3(hr,hr); + for (i = 0;i < p;++i) result |= modq_nonzero_mask(hr[i] - c[i]); + + small_encode(rstr,r); + crypto_hash_sha512(hash,rstr,sizeof rstr); + result |= crypto_verify_32(hash,cstr); + + for (i = 0;i < 32;++i) k[i] = (hash[32 + i] & ~result); + return result; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/enc.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/enc.c new file mode 100644 index 000000000..68f4fce15 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/enc.c @@ -0,0 +1,49 @@ +#ifdef KAT +#include +#endif + +#include +#include "params.h" +#include "small.h" +#include "rq.h" +#include "crypto_hash_sha512.h" +#include "crypto_kem.h" + +int crypto_kem_enc( + unsigned char *cstr, + unsigned char *k, + const unsigned char *pk +) +{ + small r[p]; + modq h[p]; + modq c[p]; + unsigned char rstr[small_encode_len]; + unsigned char hash[64]; + + small_random_weightw(r); + +#ifdef KAT + { + int i; + printf("encrypt r:"); + for (i = 0;i < p;++i) + if (r[i] == 1) printf(" +%d",i); + else if (r[i] == -1) printf(" -%d",i); + printf("\n"); + } +#endif + + small_encode(rstr,r); + crypto_hash_sha512(hash,rstr,sizeof rstr); + + rq_decode(h,pk); + rq_mult(c,h,r); + rq_round3(c,c); + + memcpy(k,hash + 32,32); + memcpy(cstr,hash,32); + rq_encoderounded(cstr + 32,c); + + return 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/implementors b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/implementors new file mode 100644 index 000000000..51ac31ea2 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/implementors @@ -0,0 +1,5 @@ +Alphabetical order: +Daniel J. 
Bernstein +Chitchanok Chuengsatiansup +Tanja Lange +Christine van Vredendaal diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/int32_sort.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/int32_sort.c new file mode 100644 index 000000000..f24441108 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/int32_sort.c @@ -0,0 +1,35 @@ +#include "int32_sort.h" +#include "crypto_uint32.h" + +static void minmax(crypto_int32 *x,crypto_int32 *y) +{ + crypto_uint32 xi = *x; + crypto_uint32 yi = *y; + crypto_uint32 xy = xi ^ yi; + crypto_uint32 c = yi - xi; + c ^= xy & (c ^ yi); + c >>= 31; + c = -c; + c &= xy; + *x = xi ^ c; + *y = yi ^ c; +} + +void int32_sort(crypto_int32 *x,int n) +{ + int top,p,q,i; + + if (n < 2) return; + top = 1; + while (top < n - top) top += top; + + for (p = top;p > 0;p >>= 1) { + for (i = 0;i < n - p;++i) + if (!(i & p)) + minmax(x + i,x + i + p); + for (q = top;q > p;q >>= 1) + for (i = 0;i < n - q;++i) + if (!(i & p)) + minmax(x + i + p,x + i + q); + } +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/int32_sort.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/int32_sort.h new file mode 100644 index 000000000..345368d26 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef int32_sort_h +#define int32_sort_h + +#include "crypto_int32.h" + +#define int32_sort crypto_kem_sntrup4591761_ref_int32_sort +extern void int32_sort(crypto_int32 *,int); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/keypair.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/keypair.c new file mode 100644 index 000000000..25b768bcc --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/keypair.c @@ -0,0 +1,39 @@ +#include +#include "modq.h" +#include "params.h" +#include "r3.h" +#include "small.h" +#include "rq.h" +#include "crypto_kem.h" + +#if crypto_kem_PUBLICKEYBYTES != 
rq_encode_len +#error "crypto_kem_PUBLICKEYBYTES must match rq_encode_len" +#endif +#if crypto_kem_SECRETKEYBYTES != rq_encode_len + 2 * small_encode_len +#error "crypto_kem_SECRETKEYBYTES must match rq_encode_len + 2 * small_encode_len" +#endif + +int crypto_kem_keypair(unsigned char *pk,unsigned char *sk) +{ + small g[p]; + small grecip[p]; + small f[p]; + modq f3recip[p]; + modq h[p]; + + do + small_random(g); + while (r3_recip(grecip,g) != 0); + + small_random_weightw(f); + rq_recip3(f3recip,f); + + rq_mult(h,f3recip,g); + + rq_encode(pk,h); + small_encode(sk,f); + small_encode(sk + small_encode_len,grecip); + memcpy(sk + 2 * small_encode_len,pk,rq_encode_len); + + return 0; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/mod3.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/mod3.h new file mode 100644 index 000000000..c51f2edd9 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/mod3.h @@ -0,0 +1,60 @@ +#ifndef mod3_h +#define mod3_h + +#include "small.h" +#include "crypto_int32.h" + +/* -1 if x is nonzero, 0 otherwise */ +static inline int mod3_nonzero_mask(small x) +{ + return -x*x; +} + +/* input between -100000 and 100000 */ +/* output between -1 and 1 */ +static inline small mod3_freeze(crypto_int32 a) +{ + a -= 3 * ((10923 * a) >> 15); + a -= 3 * ((89478485 * a + 134217728) >> 28); + return a; +} + +static inline small mod3_minusproduct(small a,small b,small c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return mod3_freeze(A - B * C); +} + +static inline small mod3_plusproduct(small a,small b,small c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return mod3_freeze(A + B * C); +} + +static inline small mod3_product(small a,small b) +{ + return a * b; +} + +static inline small mod3_sum(small a,small b) +{ + crypto_int32 A = a; + crypto_int32 B = b; + return mod3_freeze(A + B); +} + +static inline small mod3_reciprocal(small a1) +{ + return a1; +} + 
+static inline small mod3_quotient(small num,small den) +{ + return mod3_product(num,mod3_reciprocal(den)); +} + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/modq.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/modq.h new file mode 100644 index 000000000..a7d26b70a --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/modq.h @@ -0,0 +1,92 @@ +#ifndef modq_h +#define modq_h + +#include "crypto_int16.h" +#include "crypto_int32.h" +#include "crypto_uint16.h" +#include "crypto_uint32.h" + +typedef crypto_int16 modq; + +/* -1 if x is nonzero, 0 otherwise */ +static inline int modq_nonzero_mask(modq x) +{ + crypto_int32 r = (crypto_uint16) x; + r = -r; + r >>= 30; + return r; +} + +/* input between -9000000 and 9000000 */ +/* output between -2295 and 2295 */ +static inline modq modq_freeze(crypto_int32 a) +{ + a -= 4591 * ((228 * a) >> 20); + a -= 4591 * ((58470 * a + 134217728) >> 28); + return a; +} + +static inline modq modq_minusproduct(modq a,modq b,modq c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return modq_freeze(A - B * C); +} + +static inline modq modq_plusproduct(modq a,modq b,modq c) +{ + crypto_int32 A = a; + crypto_int32 B = b; + crypto_int32 C = c; + return modq_freeze(A + B * C); +} + +static inline modq modq_product(modq a,modq b) +{ + crypto_int32 A = a; + crypto_int32 B = b; + return modq_freeze(A * B); +} + +static inline modq modq_square(modq a) +{ + crypto_int32 A = a; + return modq_freeze(A * A); +} + +static inline modq modq_sum(modq a,modq b) +{ + crypto_int32 A = a; + crypto_int32 B = b; + return modq_freeze(A + B); +} + +static inline modq modq_reciprocal(modq a1) +{ + modq a2 = modq_square(a1); + modq a3 = modq_product(a2,a1); + modq a4 = modq_square(a2); + modq a8 = modq_square(a4); + modq a16 = modq_square(a8); + modq a32 = modq_square(a16); + modq a35 = modq_product(a32,a3); + modq a70 = modq_square(a35); + modq a140 = modq_square(a70); + modq a143 
= modq_product(a140,a3); + modq a286 = modq_square(a143); + modq a572 = modq_square(a286); + modq a1144 = modq_square(a572); + modq a1147 = modq_product(a1144,a3); + modq a2294 = modq_square(a1147); + modq a4588 = modq_square(a2294); + modq a4589 = modq_product(a4588,a1); + return a4589; +} + +static inline modq modq_quotient(modq num,modq den) +{ + return modq_product(num,modq_reciprocal(den)); +} + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/params.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/params.h new file mode 100644 index 000000000..655e6ec09 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/params.h @@ -0,0 +1,14 @@ +#ifndef params_h +#define params_h + +#define q 4591 +/* XXX: also built into modq in various ways */ + +#define qshift 2295 +#define p 761 +#define w 286 + +#define rq_encode_len 1218 +#define small_encode_len 191 + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3.h new file mode 100644 index 000000000..4308dd935 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3.h @@ -0,0 +1,12 @@ +#ifndef r3_h +#define r3_h + +#include "small.h" + +#define r3_mult crypto_kem_sntrup4591761_ref_r3_mult +extern void r3_mult(small *,const small *,const small *); + +#define r3_recip crypto_kem_sntrup4591761_ref_r3_recip +extern int r3_recip(small *,const small *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3_mult.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3_mult.c new file mode 100644 index 000000000..0a4273dce --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3_mult.c @@ -0,0 +1,31 @@ +#include "params.h" +#include "mod3.h" +#include "r3.h" + +void r3_mult(small *h,const small *f,const small *g) +{ + small fg[p + p - 1]; + small result; + int i, j; + + for (i = 0;i < p;++i) { + result = 0; + for 
(j = 0;j <= i;++j) + result = mod3_plusproduct(result,f[j],g[i - j]); + fg[i] = result; + } + for (i = p;i < p + p - 1;++i) { + result = 0; + for (j = i - p + 1;j < p;++j) + result = mod3_plusproduct(result,f[j],g[i - j]); + fg[i] = result; + } + + for (i = p + p - 2;i >= p;--i) { + fg[i - p] = mod3_sum(fg[i - p],fg[i]); + fg[i - p + 1] = mod3_sum(fg[i - p + 1],fg[i]); + } + + for (i = 0;i < p;++i) + h[i] = fg[i]; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3_recip.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3_recip.c new file mode 100644 index 000000000..3c56b7fe9 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/r3_recip.c @@ -0,0 +1,126 @@ +#include "params.h" +#include "mod3.h" +#include "swap.h" +#include "r3.h" + +/* caller must ensure that x-y does not overflow */ +static int smaller_mask(int x,int y) +{ + return (x - y) >> 31; +} + +static void vectormod3_product(small *z,int len,const small *x,const small c) +{ + int i; + for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c); +} + +static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c) +{ + int i; + for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c); +} + +static void vectormod3_shift(small *z,int len) +{ + int i; + for (i = len - 1;i > 0;--i) z[i] = z[i - 1]; + z[0] = 0; +} + +/* +r = s^(-1) mod m, returning 0, if s is invertible mod m +or returning -1 if s is not invertible mod m +r,s are polys of degree

= loops) break; + + c = mod3_quotient(g[p],f[p]); + + vectormod3_minusproduct(g,p + 1,g,f,c); + vectormod3_shift(g,p + 1); + +#ifdef SIMPLER + vectormod3_minusproduct(v,loops + 1,v,u,c); + vectormod3_shift(v,loops + 1); +#else + if (loop < p) { + vectormod3_minusproduct(v,loop + 1,v,u,c); + vectormod3_shift(v,loop + 2); + } else { + vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c); + vectormod3_shift(v + loop - p,p + 2); + } +#endif + + e -= 1; + + ++loop; + + swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]); + swap(&e,&d,sizeof e,swapmask); + swap(f,g,(p + 1) * sizeof(small),swapmask); + +#ifdef SIMPLER + swap(u,v,(loops + 1) * sizeof(small),swapmask); +#else + if (loop < p) { + swap(u,v,(loop + 1) * sizeof(small),swapmask); + } else { + swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask); + } +#endif + } + + c = mod3_reciprocal(f[p]); + vectormod3_product(r,p,u + p,c); + return smaller_mask(0,d); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/random32.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/random32.c new file mode 100644 index 000000000..3cf8e38aa --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/random32.c @@ -0,0 +1,24 @@ +#include "randombytes.h" +#include "small.h" + +#ifdef KAT +/* NIST KAT generator fails to provide chunk-independence */ +static unsigned char x[4*761]; +static long long pos = 4*761; +#endif + +crypto_int32 small_random32(void) +{ +#ifdef KAT + if (pos == 4*761) { + randombytes(x,sizeof x); + pos = 0; + } + pos += 4; + return x[pos - 4] + (x[pos - 3] << 8) + (x[pos - 2] << 16) + (x[pos - 1] << 24); +#else + unsigned char x[4]; + randombytes(x,4); + return x[0] + (x[1] << 8) + (x[2] << 16) + (x[3] << 24); +#endif +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/randomsmall.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/randomsmall.c new file mode 100644 index 000000000..f0a226c57 --- /dev/null +++ 
b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/randomsmall.c @@ -0,0 +1,14 @@ +#include "params.h" +#include "randombytes.h" +#include "crypto_uint32.h" +#include "small.h" + +void small_random(small *g) +{ + int i; + + for (i = 0;i < p;++i) { + crypto_uint32 r = small_random32(); + g[i] = (small) (((1073741823 & r) * 3) >> 30) - 1; + } +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/randomweightw.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/randomweightw.c new file mode 100644 index 000000000..5aa2c09df --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/randomweightw.c @@ -0,0 +1,16 @@ +#include "params.h" +#include "randombytes.h" +#include "int32_sort.h" +#include "small.h" + +void small_random_weightw(small *f) +{ + crypto_int32 r[p]; + int i; + + for (i = 0;i < p;++i) r[i] = small_random32(); + for (i = 0;i < w;++i) r[i] &= -2; + for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1; + int32_sort(r,p); + for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq.c new file mode 100644 index 000000000..a115ff9dd --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq.c @@ -0,0 +1,128 @@ +#include "params.h" +#include "crypto_uint32.h" +#include "rq.h" + +void rq_encode(unsigned char *c,const modq *f) +{ + crypto_int32 f0, f1, f2, f3, f4; + int i; + + for (i = 0;i < p/5;++i) { + f0 = *f++ + qshift; + f1 = *f++ + qshift; + f2 = *f++ + qshift; + f3 = *f++ + qshift; + f4 = *f++ + qshift; + /* now want f0 + 6144*f1 + ... 
as a 64-bit integer */ + f1 *= 3; + f2 *= 9; + f3 *= 27; + f4 *= 81; + /* now want f0 + f1<<11 + f2<<22 + f3<<33 + f4<<44 */ + f0 += f1 << 11; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + f0 += f2 << 6; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + f0 += f3 << 1; + *c++ = f0; f0 >>= 8; + f0 += f4 << 4; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; + } + /* XXX: using p mod 5 = 1 */ + f0 = *f++ + qshift; + *c++ = f0; f0 >>= 8; + *c++ = f0; +} + +void rq_decode(modq *f,const unsigned char *c) +{ + crypto_uint32 c0, c1, c2, c3, c4, c5, c6, c7; + crypto_uint32 f0, f1, f2, f3, f4; + int i; + + for (i = 0;i < p/5;++i) { + c0 = *c++; + c1 = *c++; + c2 = *c++; + c3 = *c++; + c4 = *c++; + c5 = *c++; + c6 = *c++; + c7 = *c++; + + /* f0 + f1*6144 + f2*6144^2 + f3*6144^3 + f4*6144^4 */ + /* = c0 + c1*256 + ... + c6*256^6 + c7*256^7 */ + /* with each f between 0 and 4590 */ + + c6 += c7 << 8; + /* c6 <= 23241 = floor(4591*6144^4/2^48) */ + /* f4 = (16/81)c6 + (1/1296)(c5+[0,1]) - [0,0.75] */ + /* claim: 2^19 f4 < x < 2^19(f4+1) */ + /* where x = 103564 c6 + 405(c5+1) */ + /* proof: x - 2^19 f4 = (76/81)c6 + (37/81)c5 + 405 - (32768/81)[0,1] + 2^19[0,0.75] */ + /* at least 405 - 32768/81 > 0 */ + /* at most (76/81)23241 + (37/81)255 + 405 + 2^19 0.75 < 2^19 */ + f4 = (103564*c6 + 405*(c5+1)) >> 19; + + c5 += c6 << 8; + c5 -= (f4 * 81) << 4; + c4 += c5 << 8; + + /* f0 + f1*6144 + f2*6144^2 + f3*6144^3 */ + /* = c0 + c1*256 + c2*256^2 + c3*256^3 + c4*256^4 */ + /* c4 <= 247914 = floor(4591*6144^3/2^32) */ + /* f3 = (1/54)(c4+[0,1]) - [0,0.75] */ + /* claim: 2^19 f3 < x < 2^19(f3+1) */ + /* where x = 9709(c4+2) */ + /* proof: x - 2^19 f3 = 19418 - (1/27)c4 - (262144/27)[0,1] + 2^19[0,0.75] */ + /* at least 19418 - 247914/27 - 262144/27 > 0 */ + /* at most 19418 + 2^19 0.75 < 2^19 */ + f3 = (9709*(c4+2)) >> 19; + + c4 -= (f3 * 27) << 1; + c3 += c4 << 8; + /* f0 + f1*6144 + f2*6144^2 */ + /* = c0 + c1*256 + c2*256^2 + c3*256^3 */ + /* c3 <= 10329 = 
floor(4591*6144^2/2^24) */ + /* f2 = (4/9)c3 + (1/576)c2 + (1/147456)c1 + (1/37748736)c0 - [0,0.75] */ + /* claim: 2^19 f2 < x < 2^19(f2+1) */ + /* where x = 233017 c3 + 910(c2+2) */ + /* proof: x - 2^19 f2 = 1820 + (1/9)c3 - (2/9)c2 - (32/9)c1 - (1/72)c0 + 2^19[0,0.75] */ + /* at least 1820 - (2/9)255 - (32/9)255 - (1/72)255 > 0 */ + /* at most 1820 + (1/9)10329 + 2^19 0.75 < 2^19 */ + f2 = (233017*c3 + 910*(c2+2)) >> 19; + + c2 += c3 << 8; + c2 -= (f2 * 9) << 6; + c1 += c2 << 8; + /* f0 + f1*6144 */ + /* = c0 + c1*256 */ + /* c1 <= 110184 = floor(4591*6144/2^8) */ + /* f1 = (1/24)c1 + (1/6144)c0 - (1/6144)f0 */ + /* claim: 2^19 f1 < x < 2^19(f1+1) */ + /* where x = 21845(c1+2) + 85 c0 */ + /* proof: x - 2^19 f1 = 43690 - (1/3)c1 - (1/3)c0 + 2^19 [0,0.75] */ + /* at least 43690 - (1/3)110184 - (1/3)255 > 0 */ + /* at most 43690 + 2^19 0.75 < 2^19 */ + f1 = (21845*(c1+2) + 85*c0) >> 19; + + c1 -= (f1 * 3) << 3; + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 + q - qshift); + *f++ = modq_freeze(f1 + q - qshift); + *f++ = modq_freeze(f2 + q - qshift); + *f++ = modq_freeze(f3 + q - qshift); + *f++ = modq_freeze(f4 + q - qshift); + } + + c0 = *c++; + c1 = *c++; + c0 += c1 << 8; + *f++ = modq_freeze(c0 + q - qshift); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq.h new file mode 100644 index 000000000..c8007896b --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq.h @@ -0,0 +1,28 @@ +#ifndef rq_h +#define rq_h + +#include "modq.h" +#include "small.h" + +#define rq_encode crypto_kem_sntrup4591761_ref_rq_encode +extern void rq_encode(unsigned char *,const modq *); + +#define rq_decode crypto_kem_sntrup4591761_ref_rq_decode +extern void rq_decode(modq *,const unsigned char *); + +#define rq_encoderounded crypto_kem_sntrup4591761_ref_rq_encoderounded +extern void rq_encoderounded(unsigned char *,const modq *); + +#define rq_decoderounded 
crypto_kem_sntrup4591761_ref_rq_decoderounded +extern void rq_decoderounded(modq *,const unsigned char *); + +#define rq_round3 crypto_kem_sntrup4591761_ref_rq_round +extern void rq_round3(modq *,const modq *); + +#define rq_mult crypto_kem_sntrup4591761_ref_rq_mult +extern void rq_mult(modq *,const modq *,const small *); + +#define rq_recip3 crypto_kem_sntrup4591761_ref_rq_recip3 +int rq_recip3(modq *,const small *); + +#endif diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_mult.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_mult.c new file mode 100644 index 000000000..86dc7da03 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_mult.c @@ -0,0 +1,30 @@ +#include "params.h" +#include "rq.h" + +void rq_mult(modq *h,const modq *f,const small *g) +{ + modq fg[p + p - 1]; + modq result; + int i, j; + + for (i = 0;i < p;++i) { + result = 0; + for (j = 0;j <= i;++j) + result = modq_plusproduct(result,f[j],g[i - j]); + fg[i] = result; + } + for (i = p;i < p + p - 1;++i) { + result = 0; + for (j = i - p + 1;j < p;++j) + result = modq_plusproduct(result,f[j],g[i - j]); + fg[i] = result; + } + + for (i = p + p - 2;i >= p;--i) { + fg[i - p] = modq_sum(fg[i - p],fg[i]); + fg[i - p + 1] = modq_sum(fg[i - p + 1],fg[i]); + } + + for (i = 0;i < p;++i) + h[i] = fg[i]; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_recip3.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_recip3.c new file mode 100644 index 000000000..925b10fc6 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_recip3.c @@ -0,0 +1,125 @@ +#include "params.h" +#include "swap.h" +#include "rq.h" + +/* caller must ensure that x-y does not overflow */ +static int smaller_mask(int x,int y) +{ + return (x - y) >> 31; +} + +static void vectormodq_product(modq *z,int len,const modq *x,const modq c) +{ + int i; + for (i = 0;i < len;++i) z[i] = modq_product(x[i],c); +} + +static void 
vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c) +{ + int i; + for (i = 0;i < len;++i) z[i] = modq_minusproduct(x[i],y[i],c); +} + +static void vectormodq_shift(modq *z,int len) +{ + int i; + for (i = len - 1;i > 0;--i) z[i] = z[i - 1]; + z[0] = 0; +} + +/* +r = (3s)^(-1) mod m, returning 0, if s is invertible mod m +or returning -1 if s is not invertible mod m +r,s are polys of degree

= loops) break; + + c = modq_quotient(g[p],f[p]); + + vectormodq_minusproduct(g,p + 1,g,f,c); + vectormodq_shift(g,p + 1); + +#ifdef SIMPLER + vectormodq_minusproduct(v,loops + 1,v,u,c); + vectormodq_shift(v,loops + 1); +#else + if (loop < p) { + vectormodq_minusproduct(v,loop + 1,v,u,c); + vectormodq_shift(v,loop + 2); + } else { + vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c); + vectormodq_shift(v + loop - p,p + 2); + } +#endif + + e -= 1; + + ++loop; + + swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]); + swap(&e,&d,sizeof e,swapmask); + swap(f,g,(p + 1) * sizeof(modq),swapmask); + +#ifdef SIMPLER + swap(u,v,(loops + 1) * sizeof(modq),swapmask); +#else + if (loop < p) { + swap(u,v,(loop + 1) * sizeof(modq),swapmask); + } else { + swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask); + } +#endif + } + + c = modq_reciprocal(f[p]); + vectormodq_product(r,p,u + p,c); + return smaller_mask(0,d); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_round3.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_round3.c new file mode 100644 index 000000000..c972e8e4e --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_round3.c @@ -0,0 +1,10 @@ +#include "params.h" +#include "rq.h" + +void rq_round3(modq *h,const modq *f) +{ + int i; + + for (i = 0;i < p;++i) + h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_rounded.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_rounded.c new file mode 100644 index 000000000..04c75f324 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/rq_rounded.c @@ -0,0 +1,101 @@ +#include "params.h" +#include "crypto_uint32.h" +#include "rq.h" + +void rq_encoderounded(unsigned char *c,const modq *f) +{ + crypto_int32 f0, f1, f2; + int i; + + for (i = 0;i < p/3;++i) { + f0 = *f++ + qshift; + f1 = *f++ + qshift; + f2 = *f++ + qshift; + 
f0 = (21846 * f0) >> 16; + f1 = (21846 * f1) >> 16; + f2 = (21846 * f2) >> 16; + /* now want f0 + f1*1536 + f2*1536^2 as a 32-bit integer */ + f2 *= 3; + f1 += f2 << 9; + f1 *= 3; + f0 += f1 << 9; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; + } + /* XXX: using p mod 3 = 2 */ + f0 = *f++ + qshift; + f1 = *f++ + qshift; + f0 = (21846 * f0) >> 16; + f1 = (21846 * f1) >> 16; + f1 *= 3; + f0 += f1 << 9; + *c++ = f0; f0 >>= 8; + *c++ = f0; f0 >>= 8; + *c++ = f0; +} + +void rq_decoderounded(modq *f,const unsigned char *c) +{ + crypto_uint32 c0, c1, c2, c3; + crypto_uint32 f0, f1, f2; + int i; + + for (i = 0;i < p/3;++i) { + c0 = *c++; + c1 = *c++; + c2 = *c++; + c3 = *c++; + + /* f0 + f1*1536 + f2*1536^2 */ + /* = c0 + c1*256 + c2*256^2 + c3*256^3 */ + /* with each f between 0 and 1530 */ + + /* f2 = (64/9)c3 + (1/36)c2 + (1/9216)c1 + (1/2359296)c0 - [0,0.99675] */ + /* claim: 2^21 f2 < x < 2^21(f2+1) */ + /* where x = 14913081*c3 + 58254*c2 + 228*(c1+2) */ + /* proof: x - 2^21 f2 = 456 - (8/9)c0 + (4/9)c1 - (2/9)c2 + (1/9)c3 + 2^21 [0,0.99675] */ + /* at least 456 - (8/9)255 - (2/9)255 > 0 */ + /* at most 456 + (4/9)255 + (1/9)255 + 2^21 0.99675 < 2^21 */ + f2 = (14913081*c3 + 58254*c2 + 228*(c1+2)) >> 21; + + c2 += c3 << 8; + c2 -= (f2 * 9) << 2; + /* f0 + f1*1536 */ + /* = c0 + c1*256 + c2*256^2 */ + /* c2 <= 35 = floor((1530+1530*1536)/256^2) */ + /* f1 = (128/3)c2 + (1/6)c1 + (1/1536)c0 - (1/1536)f0 */ + /* claim: 2^21 f1 < x < 2^21(f1+1) */ + /* where x = 89478485*c2 + 349525*c1 + 1365*(c0+1) */ + /* proof: x - 2^21 f1 = 1365 - (1/3)c2 - (1/3)c1 - (1/3)c0 + (4096/3)f0 */ + /* at least 1365 - (1/3)35 - (1/3)255 - (1/3)255 > 0 */ + /* at most 1365 + (4096/3)1530 < 2^21 */ + f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21; + + c1 += c2 << 8; + c1 -= (f1 * 3) << 1; + + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 * 3 + q - qshift); + *f++ = modq_freeze(f1 * 3 + q - qshift); + *f++ = modq_freeze(f2 * 3 + q - qshift); + } + 
+ c0 = *c++; + c1 = *c++; + c2 = *c++; + + f1 = (89478485*c2 + 349525*c1 + 1365*(c0+1)) >> 21; + + c1 += c2 << 8; + c1 -= (f1 * 3) << 1; + + c0 += c1 << 8; + f0 = c0; + + *f++ = modq_freeze(f0 * 3 + q - qshift); + *f++ = modq_freeze(f1 * 3 + q - qshift); +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/small.c b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/small.c new file mode 100644 index 000000000..270dcbe28 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/small.c @@ -0,0 +1,37 @@ +#include "params.h" +#include "small.h" + +/* XXX: these functions rely on p mod 4 = 1 */ + +/* all coefficients in -1, 0, 1 */ +void small_encode(unsigned char *c,const small *f) +{ + small c0; + int i; + + for (i = 0;i < p/4;++i) { + c0 = *f++ + 1; + c0 += (*f++ + 1) << 2; + c0 += (*f++ + 1) << 4; + c0 += (*f++ + 1) << 6; + *c++ = c0; + } + c0 = *f++ + 1; + *c++ = c0; +} + +void small_decode(small *f,const unsigned char *c) +{ + unsigned char c0; + int i; + + for (i = 0;i < p/4;++i) { + c0 = *c++; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2; + *f++ = ((small) (c0 & 3)) - 1; + } + c0 = *c++; + *f++ = ((small) (c0 & 3)) - 1; +} diff --git a/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/small.h b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/small.h new file mode 100644 index 000000000..b0b931194 --- /dev/null +++ b/crypto/ntruprime-20171206/crypto_kem/sntrup4591761/ref/small.h @@ -0,0 +1,24 @@ +#ifndef small_h +#define small_h + +#include "crypto_int8.h" +#include "crypto_int32.h" + +typedef crypto_int8 small; + +#define small_encode crypto_kem_sntrup4591761_ref_small_encode +extern void small_encode(unsigned char *,const small *); + +#define small_decode crypto_kem_sntrup4591761_ref_small_decode +extern void small_decode(small *,const unsigned char *); + +#define small_random32 crypto_kem_sntrup4591761_ref_small_random32 
/* Constant-time conditional swap (portable reference version):
   exchanges the two byte regions when mask is -1, leaves them
   untouched when mask is 0.  The same loads, stores and XOR
   arithmetic run either way, so the choice is invisible to timing. */
void swap(void *x,void *y,int bytes,int mask)
{
  char *a = x;
  char *b = y;
  char keep = mask;
  int i;

  for (i = 0;i < bytes;++i) {
    char t = keep & (a[i] ^ b[i]);  /* a^b under the mask, else 0 */
    a[i] ^= t;
    b[i] ^= t;
  }
}