mirror of https://github.com/oxen-io/lokinet
restructure original code into libntrup
parent
a60c4b0bef
commit
8eacffca09
@ -0,0 +1 @@
|
||||
#include <sodium/crypto_hash_sha512.h>
|
@ -0,0 +1,2 @@
|
||||
#include <stdint.h>
|
||||
typedef int16_t crypto_int16; /* fixed-width 16-bit signed alias used by the NTRU Prime sources */
|
@ -0,0 +1,2 @@
|
||||
#include <stdint.h>
|
||||
typedef int32_t crypto_int32; /* fixed-width 32-bit signed alias used by the NTRU Prime sources */
|
@ -0,0 +1,2 @@
|
||||
#include <stdint.h>
|
||||
typedef int64_t crypto_int64; /* fixed-width 64-bit signed alias used by the NTRU Prime sources */
|
@ -0,0 +1,2 @@
|
||||
#include <stdint.h>
|
||||
typedef int8_t crypto_int8; /* fixed-width 8-bit signed alias used by the NTRU Prime sources */
|
@ -0,0 +1 @@
|
||||
#include <libntrup/ntru.h>
|
@ -0,0 +1,2 @@
|
||||
#include <stdint.h>
|
||||
typedef uint16_t crypto_uint16; /* fixed-width 16-bit unsigned alias used by the NTRU Prime sources */
|
@ -0,0 +1,2 @@
|
||||
#include <stdint.h>
|
||||
typedef uint32_t crypto_uint32; /* fixed-width 32-bit unsigned alias used by the NTRU Prime sources */
|
@ -0,0 +1 @@
|
||||
#include <sodium/crypto_verify_32.h>
|
@ -0,0 +1,26 @@
|
||||
#ifndef LIBNTRUP_NTRU_H
#define LIBNTRUP_NTRU_H
#ifdef __cplusplus
extern "C" {
#endif

#include "ntru_api.h"

/* Detect CPU features and point the crypto_kem_* entry points at either
   the AVX2 or the portable reference implementation.
   Must be called once before any of the functions below. */
void ntru_init(void);

/* Encapsulate: write ciphertext to cstr and the shared key to k,
   using public key pk.  Returns 0 on success. */
int crypto_kem_enc(unsigned char *cstr, unsigned char *k, const unsigned char *pk);

/* Decapsulate: recover the shared key k from ciphertext cstr using
   secret key sk.  Returns 0 on success, nonzero on rejection. */
int crypto_kem_dec(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);

/* Generate a keypair: public key into pk, secret key into sk.
   Returns 0 on success. */
int crypto_kem_keypair(unsigned char *pk, unsigned char * sk);

/* Buffer sizes for the arrays above.
   NOTE(review): these values (sk 1600 / pk 1218 / ct 1047) match
   Streamlined NTRU Prime 4591761, while the removed params.h in this same
   commit used NTRU LPRime 4591761 sizes (1238/1047/1175) -- confirm which
   scheme the ref/avx sources actually implement. */
#define crypto_kem_SECRETKEYBYTES 1600
#define crypto_kem_PUBLICKEYBYTES 1218
#define crypto_kem_CIPHERTEXTBYTES 1047
#define CRYPTO_BYTES 32

#ifdef __cplusplus
}
#endif
#endif
|
@ -0,0 +1,17 @@
|
||||
|
||||
/* Size aliases for consumers that prefer the NTRU_* spelling.
   NOTE(review): the CRYPTO_* macros were provided by the params.h removed in
   this commit; the new ntru.h defines crypto_kem_* instead -- confirm these
   still resolve after the restructure. */
#define NTRU_SECRETKEYBYTES CRYPTO_SECRETKEYBYTES
#define NTRU_PUBLICKEYBYTES CRYPTO_PUBLICKEYBYTES
#define NTRU_CIPHERTEXTBYTES CRYPTO_CIPHERTEXTBYTES

/* Portable C reference implementation (always available). */
int crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k, const unsigned char *pk);

int crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);

int crypto_kem_keypair_ref(unsigned char *pk, unsigned char * sk);

/* AVX2-accelerated implementation; only safe to call when the CPU
   supports AVX2 (see ntru_init / supports_avx2). */
int crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k, const unsigned char *pk);

int crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);

int crypto_kem_keypair_avx2(unsigned char *pk, unsigned char * sk);
|
@ -0,0 +1 @@
|
||||
#include <sodium/randombytes.h>
|
@ -0,0 +1,32 @@
|
||||
crypto/libntrup/src/ref/randomsmall.c
|
||||
crypto/libntrup/src/ref/swap.c
|
||||
crypto/libntrup/src/ref/rq_round3.c
|
||||
crypto/libntrup/src/ref/rq_recip3.c
|
||||
crypto/libntrup/src/ref/small.c
|
||||
crypto/libntrup/src/ref/rq_mult.c
|
||||
crypto/libntrup/src/ref/randomweightw.c
|
||||
crypto/libntrup/src/ref/random32.c
|
||||
crypto/libntrup/src/ref/dec.c
|
||||
crypto/libntrup/src/ref/r3_mult.c
|
||||
crypto/libntrup/src/ref/r3_recip.c
|
||||
crypto/libntrup/src/ref/keypair.c
|
||||
crypto/libntrup/src/ref/rq_rounded.c
|
||||
crypto/libntrup/src/ref/enc.c
|
||||
crypto/libntrup/src/ref/int32_sort.c
|
||||
crypto/libntrup/src/ref/rq.c
|
||||
crypto/libntrup/src/avx/randomsmall.c
|
||||
crypto/libntrup/src/avx/weight.c
|
||||
crypto/libntrup/src/avx/swap.c
|
||||
crypto/libntrup/src/avx/rq_round3.c
|
||||
crypto/libntrup/src/avx/rq_recip3.c
|
||||
crypto/libntrup/src/avx/small.c
|
||||
crypto/libntrup/src/avx/randomweightw.c
|
||||
crypto/libntrup/src/avx/dec.c
|
||||
crypto/libntrup/src/avx/r3_recip.c
|
||||
crypto/libntrup/src/avx/keypair.c
|
||||
crypto/libntrup/src/avx/rq_rounded.c
|
||||
crypto/libntrup/src/avx/mult.c
|
||||
crypto/libntrup/src/avx/enc.c
|
||||
crypto/libntrup/src/avx/int32_sort.c
|
||||
crypto/libntrup/src/avx/rq.c
|
||||
crypto/libntrup/src/avx/rq_mod3.c
|
@ -0,0 +1,65 @@
|
||||
#include <libntrup/ntru.h>
#include <stdbool.h>

#if defined(__x86_64__) || defined(__i386__)
#include <cpuid.h>

/* Runtime AVX2 detection.
   AVX2 is reported by CPUID leaf 7 (subleaf 0), EBX bit 5.
   BUG FIX: the original queried leaf 0 twice, so the feature bit it tested
   was meaningless and AVX2 was never (correctly) detected. */
static bool supports_avx2(void)
{
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  /* Leaf 0: EAX holds the highest supported standard leaf. */
  __cpuid(0, eax, ebx, ecx, edx);
  if(eax < 7)
    return false;
  /* Leaf 7, subleaf 0: structured extended feature flags. */
  __cpuid_count(7, 0, eax, ebx, ecx, edx);
  return (ebx & (1u << 5)) != 0;
}

#else

/* Non-x86 targets never have AVX2. */
static bool supports_avx2(void)
{
  return false;
}

#endif

/* Dispatch pointers filled in by ntru_init(); calling the crypto_kem_*
   wrappers before ntru_init() dereferences NULL. */
int (*__crypto_kem_enc)(unsigned char *cstr, unsigned char *k, const unsigned char *pk);

int (*__crypto_kem_dec)(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);

int (*__crypto_kem_keypair)(unsigned char *pk, unsigned char * sk);

/* Select the AVX2 or reference implementation at runtime.
   BUG FIX: the original assigned __crypto_kem_dec twice in each branch and
   never set __crypto_kem_keypair, so crypto_kem_keypair() crashed on a NULL
   function pointer. */
void ntru_init()
{
  if(supports_avx2())
  {
    __crypto_kem_enc = crypto_kem_enc_avx2;
    __crypto_kem_dec = crypto_kem_dec_avx2;
    __crypto_kem_keypair = crypto_kem_keypair_avx2;
  }
  else
  {
    __crypto_kem_enc = crypto_kem_enc_ref;
    __crypto_kem_dec = crypto_kem_dec_ref;
    __crypto_kem_keypair = crypto_kem_keypair_ref;
  }
}

/* Public entry points: thin forwarders through the dispatch pointers. */
int crypto_kem_enc(unsigned char *cstr, unsigned char *k, const unsigned char *pk)
{
  return __crypto_kem_enc(cstr, k, pk);
}

int crypto_kem_dec(unsigned char *k, const unsigned char *cstr, const unsigned char *sk)
{
  return __crypto_kem_dec(k, cstr, sk);
}

int crypto_kem_keypair(unsigned char *pk, unsigned char * sk)
{
  return __crypto_kem_keypair(pk, sk);
}
|
@ -1,4 +0,0 @@
|
||||
/* NTRU LPRime 4591761 parameter sizes (p = 761, q = 4591); the scheme name
   is confirmed by the crypto_kem_ntrulpr4591761_avx_* symbol prefixes. */
#define CRYPTO_SECRETKEYBYTES 1238
#define CRYPTO_PUBLICKEYBYTES 1047
#define CRYPTO_CIPHERTEXTBYTES 1175
#define CRYPTO_BYTES 32
|
@ -1,57 +0,0 @@
|
||||
#ifdef KAT
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#include "params.h"
|
||||
#include "small.h"
|
||||
#include "rq.h"
|
||||
#include "hide.h"
|
||||
#include "crypto_kem.h"
|
||||
|
||||
static int verify(const unsigned char *x,const unsigned char *y)
|
||||
{
|
||||
unsigned int differentbits = 0;
|
||||
int i;
|
||||
for (i = 0;i < crypto_kem_CIPHERTEXTBYTES;++i)
|
||||
differentbits |= x[i] ^ y[i];
|
||||
return (1 & ((differentbits - 1) >> 8)) - 1;
|
||||
}
|
||||
|
||||
/* Decapsulation (ref/avx source removed in this commit).
   Recovers the 32-byte seed r from the ciphertext using the secret key,
   deterministically re-encapsulates it with hide(), and accepts only if the
   reconstruction matches the received ciphertext bit-for-bit (FO-style
   re-encryption check).  Returns 0 and writes the session key on success;
   returns -1 and zeroes k on rejection -- all in constant time. */
int crypto_kem_dec(
  unsigned char *k,
  const unsigned char *cstr,
  const unsigned char *sk
)
{
  modq buf[768];
/* B and aB alias the same scratch buffer; each is dead before the next use */
#define B buf
#define aB buf
  small a[768];
  unsigned char r[32];
  unsigned char checkcstr[crypto_kem_CIPHERTEXTBYTES];
  unsigned char maybek[32];
  int i;
  int result;

  /* secret key layout: encoded small polynomial a, then a copy of pk */
  small_decode(a,sk); sk += small_encode_len;
  rq_decoderounded(B,cstr + 32);
  rq_mult(aB,B,a);

  /* strip a*B from the top bits of the ciphertext to recover the seed r;
     ciphertext layout: 32-byte confirmation, rounded B, then top bits */
  rq_rightsubbit(r,cstr + 32 + rq_encoderounded_len,aB);

#ifdef KAT
  /* known-answer-test tracing only; not compiled in production */
  {
    int j;
    printf("decrypt r: ");
    for (j = 0;j < 32;++j)
      printf("%02x",255 & (int) r[j]);
    printf("\n");
  }
#endif

  /* deterministic re-encapsulation with the embedded public key */
  hide(checkcstr,maybek,sk,r);
  result = verify(cstr,checkcstr);

  /* result is 0 (match) or -1 (mismatch); ~result is an all-ones or
     all-zeros mask, so k is either maybek or zeroed -- branch-free */
  for (i = 0;i < 32;++i) k[i] = maybek[i] & ~result;
  return result;
}
|
@ -1,30 +0,0 @@
|
||||
#ifdef KAT
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#include "hide.h"
|
||||
#include "randombytes.h"
|
||||
#include "crypto_kem.h"
|
||||
|
||||
/* Encapsulation (ref/avx source removed in this commit).
   Draws a fresh 32-byte random seed r, then delegates to hide(), which
   deterministically derives both the ciphertext cstr and the shared session
   key k from r and the public key pk.  Always returns 0. */
int crypto_kem_enc(
  unsigned char *cstr,
  unsigned char *k,
  const unsigned char *pk
)
{
  unsigned char r[32];
  randombytes(r,32);

#ifdef KAT
  /* known-answer-test tracing only; not compiled in production */
  {
    int i;
    printf("encrypt r: ");
    for (i = 0;i < 32;++i)
      printf("%02x",255 & (int) r[i]);
    printf("\n");
  }
#endif

  hide(cstr,k,pk,r);
  return 0;
}
|
@ -1,40 +0,0 @@
|
||||
#include <string.h>
|
||||
#include "crypto_hash_sha512.h"
|
||||
#include "crypto_kem.h"
|
||||
#include "params.h"
|
||||
#include "rq.h"
|
||||
#include "hide.h"
|
||||
|
||||
#if crypto_kem_CIPHERTEXTBYTES != rq_encoderounded_len + 32 + 128
|
||||
#error "crypto_kem_CIPHERTEXTBYTES must match rq_encoderounded_len + 32 + 128"
|
||||
#endif
|
||||
|
||||
/* Deterministic encapsulation core shared by enc and the dec re-encryption
   check.  From the 32-byte seed r and public key pk it produces:
     cstr = 32-byte key confirmation || rounded-encoded B || top bits of C+r
     k    = 32-byte session key
   Public key layout: 32-byte seed for G || rounded-encoded A. */
void hide(unsigned char *cstr,unsigned char *k,const unsigned char *pk,const unsigned char *r)
{
  modq buf[768];
/* G, A, B, C all alias one scratch buffer; each value is consumed before
   the next one is written */
#define G buf
#define A buf
#define B buf
#define C buf
  unsigned char k12[64];
  unsigned char k34[64];
  small b[768];

  /* k12 = SHA-512(r); its first half seeds the ephemeral small poly b */
  crypto_hash_sha512(k12,r,32);
  small_seeded_weightw(b,k12);

  /* k34 = SHA-512(second half of k12): first 32 bytes become the key
     confirmation prefix of the ciphertext, last 32 the session key */
  crypto_hash_sha512(k34,k12 + 32,32);
  memcpy(cstr,k34,32); cstr += 32;
  memcpy(k,k34 + 32,32);

  rq_fromseed(G,pk);
  rq_mult(B,G,b);
  /* XXX: cache transform of b for next mult */
  /* XXX: cache transform of G inside sk */
  /* XXX: cache transform of G when pk is otherwise reused */
  rq_roundencode(cstr,B); cstr += rq_encoderounded_len;

  /* C = A*b; rq_top folds the seed r into the top bits of C to finish
     the ciphertext */
  rq_decoderounded(A,pk + 32);
  rq_mult(C,A,b);
  rq_top(cstr,C,r);
}
|
@ -1,9 +0,0 @@
|
||||
#ifndef hide_h
#define hide_h

#include "crypto_int32.h"

/* Namespace the symbol for the ntrulpr4591761 AVX build so it cannot
   collide with the reference build's copy at link time. */
#define hide crypto_kem_ntrulpr4591761_avx_hide
/* hide(cstr, k, pk, r): deterministic encapsulation core -- derives the
   ciphertext cstr and 32-byte session key k from public key pk and the
   32-byte seed r. */
extern void hide(unsigned char *,unsigned char *,const unsigned char *,const unsigned char *);

#endif
|
@ -1,9 +0,0 @@
|
||||
#ifndef int32_sort_h
#define int32_sort_h

#include "crypto_int32.h"

/* Namespace the symbol for the ntrulpr4591761 AVX build so it cannot
   collide with the reference build's copy at link time. */
#define int32_sort crypto_kem_ntrulpr4591761_avx_int32_sort
/* Sort an array of crypto_int32 in place (array pointer, element count). */
extern void int32_sort(crypto_int32 *,int);

#endif
|
@ -1,37 +0,0 @@
|
||||
#include <string.h>
|
||||
#include "modq.h"
|
||||
#include "params.h"
|
||||
#include "small.h"
|
||||
#include "rq.h"
|
||||
#include "crypto_kem.h"
|
||||
#include "randombytes.h"
|
||||
#include "crypto_stream_aes256ctr.h"
|
||||
|
||||
#if crypto_kem_PUBLICKEYBYTES != rq_encoderounded_len + 32
|
||||
#error "crypto_kem_PUBLICKEYBYTES must match rq_encoderounded_len + 32"
|
||||
#endif
|
||||
#if crypto_kem_SECRETKEYBYTES != small_encode_len + crypto_kem_PUBLICKEYBYTES
|
||||
#error "crypto_kem_SECRETKEYBYTES must match small_encode_len + crypto_kem_PUBLICKEYBYTES"
|
||||
#endif
|
||||
|
||||
/* Key generation (ref/avx source removed in this commit).
   Public key:  32-byte random seed (expands to G) || rounded-encoded A,
   where A = G*a for a random small weight-w polynomial a.
   Secret key:  encoded a || full copy of the public key (dec needs pk for
   its re-encryption check).  Always returns 0. */
int crypto_kem_keypair(unsigned char *pk,unsigned char *sk)
{
  modq buf[768];
/* G and A alias the same scratch buffer; G is consumed by rq_mult before
   A overwrites it */
#define G buf
#define A buf
  small a[768];

  /* the seed itself is the first 32 bytes of the public key */
  randombytes(pk,32);
  rq_fromseed(G,pk);

  small_random_weightw(a);

  rq_mult(A,G,a);

  rq_roundencode(pk + 32,A);

  small_encode(sk,a);
  /* embed the public key in the secret key for decapsulation */
  memcpy(sk + small_encode_len,pk,crypto_kem_PUBLICKEYBYTES);

  return 0;
}
|
@ -1,36 +0,0 @@
|
||||
#ifndef modq_h
#define modq_h

#include "crypto_int16.h"
#include "crypto_int32.h"
#include "crypto_uint16.h"
#include "crypto_uint32.h"

/* An element of Z/4591 held in a signed 16-bit lane, canonically in
   [-2295, 2295]. */
typedef crypto_int16 modq;

/* input between -9000000 and 9000000 */
/* output between -2295 and 2295 */
/* Two-stage branch-free reduction mod 4591: each step subtracts 4591 times
   a rounded-multiply approximation of a/4591 (228/2^20, then 58470/2^28
   with rounding constant 2^27), landing in the centered range. */
static inline modq modq_freeze(crypto_int32 a)
{
  a -= 4591 * ((228 * a) >> 20);
  a -= 4591 * ((58470 * a + 134217728) >> 28);
  return a;
}

/* input between 0 and 4294967295 */
/* output = (input % 4591) - 2295 */
/* Splits a at bit 19; since 2^19 mod 4591 = 914, low + high*914 is
   congruent to a mod 4591 and small enough for modq_freeze. */
static inline modq modq_fromuint32(crypto_uint32 a)
{
  crypto_int32 r;
  r = (a & 524287) + (a >> 19) * 914; /* <= 8010861 */
  return modq_freeze(r - 2295);
}

/* Sum of two canonical elements, re-frozen into canonical range.
   Widening to 32 bits avoids 16-bit overflow before the reduction. */
static inline modq modq_sum(modq a,modq b)
{
  crypto_int32 A = a;
  crypto_int32 B = b;
  return modq_freeze(A + B);
}

#endif
|
@ -1,738 +0,0 @@
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#include "rq.h"
|
||||
|
||||
/* One column step of a 5-accumulator rolling multiply-accumulate pipeline:
   broadcast g[j], fold the six cached f vectors into accumulators h0..h4,
   spill the finished lane to h[i+j] and rotate a fresh lane in from
   h[i+j+5].  Callers rotate the (h0..h4) argument order each step.
   NOTE(review): this gcc variant appears unused (MULSTEP below is the asm
   version) and passes an __m256* where _mm256_storeu_ps expects float* --
   confirm before re-enabling it. */
#define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \
  gj = g[j]; \
  h0 += f0 * gj; \
  _mm256_storeu_ps(&h[i + j],h0); \
  h1 += f1 * gj; \
  h2 += f2 * gj; \
  h3 += f3 * gj; \
  h4 += f4 * gj; \
  h0 = _mm256_loadu_ps(&h[i + j + 5]); \
  h0 += f5 * gj;

/* Hand-scheduled FMA version of the step above.  Memory operands %12 and
   %13 are h[i+j] (store of the completed lane) and h[i+j+5] (load of the
   incoming lane); vfmadd231ps accumulates gj*fN into each register. */
#define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
  gj = g[j]; \
  __asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vmovups %13,%0 \n\t" \
"vfmadd231ps %5,%7,%1 \n\t" \
"vfmadd231ps %5,%8,%2 \n\t" \
"vfmadd231ps %5,%9,%3 \n\t" \
"vfmadd231ps %5,%10,%4 \n\t" \
"vfmadd231ps %5,%11,%0 \n\t" \
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]),"m"(h[i+j+5]));

#define MULSTEP MULSTEP_asm

/* Same step, but the incoming lane starts fresh from gj*f5 (vmulps)
   instead of loading prior contents from h[i+j+5]. */
#define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
  gj = g[j]; \
  __asm__( \
"vfmadd231ps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vfmadd231ps %5,%7,%1 \n\t" \
"vfmadd231ps %5,%8,%2 \n\t" \
"vfmadd231ps %5,%9,%3 \n\t" \
"vfmadd231ps %5,%10,%4 \n\t" \
"vmulps %5,%11,%0 \n\t" \
: "+x"(h0),"+x"(h1),"+x"(h2),"+x"(h3),"+x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));

/* Pipeline start-up: all five accumulators begin as pure products
   (vmulps), with no prior h state loaded at all. */
#define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \
  gj = g[j]; \
  __asm__( \
"vmulps %5,%6,%0 \n\t" \
"vmovups %0,%12 \n\t" \
"vmulps %5,%7,%1 \n\t" \
"vmulps %5,%8,%2 \n\t" \
"vmulps %5,%9,%3 \n\t" \
"vmulps %5,%10,%4 \n\t" \
"vmulps %5,%11,%0 \n\t" \
: "=&x"(h0),"=&x"(h1),"=&x"(h2),"=&x"(h3),"=&x"(h4) \
: "x"(gj),"x"(f0),"x"(f1),"x"(f2),"x"(f3),"x"(f4),"x"(f5),"m"(h[i+j]));

/* Load 8 bytes and sign-extend each int8 to int16 (vpmovsxbw straight
   from memory, avoiding a separate 128-bit load). */
static inline __m128i _mm_load_cvtepi8_epi16(const long long *x)
{
  __m128i result;
  __asm__("vpmovsxbw %1, %0" : "=x"(result) : "m"(*x));
  return result;
}
|
||||
|
||||
/* Broadcast constants used throughout this file.  4591 is q; 2295 is
   (q-1)/2, the bound of the centered representative range. */
#define v0 _mm256_set1_epi32(0)
#define v0_128 _mm_set1_epi32(0)
#define v7 _mm256_set1_epi16(7)
#define v4591_16 _mm256_set1_epi16(4591)
#define v2296_16 _mm256_set1_epi16(2296)

/* 0x4b400000 is the float 12582912.0 = 1.5*2^23: adding a small int32 into
   its mantissa and then subtracting 12582912.0 converts int->float exactly
   (standard exponent-alignment trick) -- presumably why both the integer
   and float forms are kept. */
#define alpha_32 _mm256_set1_epi32(0x4b400000)
#define alpha_32_128 _mm_set1_epi32(0x4b400000)
#define alpha_float _mm256_set1_ps(12582912.0)

#define v0_float _mm256_set1_ps(0)
#define v1_float _mm256_set1_ps(1)
#define vm1_float _mm256_set1_ps(-1)
#define vm4591_float _mm256_set1_ps(-4591)
/* 1/4591 to full float precision, for the round-and-subtract reduction */
#define recip4591_float _mm256_set1_ps(0.00021781746896101067305597908952297974298)

/* Plain vector add (vaddps). */
static inline __m256 add(__m256 x,__m256 y)
{
  return x + y;
}

/* Add expressed as fma(y, 1, x) -- same result, but issues on the FMA
   port instead of the add port, balancing port pressure. */
static inline __m256 fastadd(__m256 x,__m256 y)
{
  return _mm256_fmadd_ps(y,v1_float,x);
}

/* Subtract expressed as fma(y, -1, x); same port-balancing idea. */
static inline __m256 fastsub(__m256 x,__m256 y)
{
  return _mm256_fmadd_ps(y,vm1_float,x);
}

/* Reduce each float lane mod 4591 to the centered representative:
   x - 4591*round(x/4591).  Rounding mode 8 is round-to-nearest-even
   (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC). */
static inline __m256 reduce(__m256 x)
{
  __m256 q = x * recip4591_float;
  q = _mm256_round_ps(q,8);
  return _mm256_fmadd_ps(q,vm4591_float,x);
}

/* Coarse int16 reduction: mulhrs computes round(x*7/2^15), an approximate
   quotient of x/4591 that keeps x - 4591*q within a bounded (not yet
   canonical) range. */
static inline __m256i squeeze(__m256i x)
{
  __m256i q = _mm256_mulhrs_epi16(x,v7);
  q = _mm256_mullo_epi16(q,v4591_16);
  return _mm256_sub_epi16(x,q);
}

/* Fused add-then-squeeze, so the sum never leaves registers between the
   addition and the reduction. */
static inline __m256i squeezeadd16(__m256i x,__m256i y)
{
  __m256i q;
  x = _mm256_add_epi16(x,y);
  q = _mm256_mulhrs_epi16(x,v7);
  q = _mm256_mullo_epi16(q,v4591_16);
  return _mm256_sub_epi16(x,q);
}

/* Branch-free canonicalization into [-2295, 2295]: conditionally add 4591
   to negative lanes, then conditionally subtract 4591 from lanes >= 2296,
   selecting with sign-bit masks via blendv -- constant time. */
static inline __m256i freeze(__m256i x)
{
  __m256i mask, x2296, x4591;
  x4591 = _mm256_add_epi16(x,v4591_16);
  mask = _mm256_srai_epi16(x,15);
  x = _mm256_blendv_epi8(x,x4591,mask);
  x2296 = _mm256_sub_epi16(x,v2296_16);
  mask = _mm256_srai_epi16(x2296,15);
  x4591 = _mm256_sub_epi16(x,v4591_16);
  x = _mm256_blendv_epi8(x4591,x,mask);
  return x;
}
|
||||
|
||||
/* 24*8*float32 f inputs between -10000 and 10000 */
/* 24*8*float32 g inputs between -32 and 32 */
/* 48*8*float32 h outputs between -7680000 and 7680000 */
/* Schoolbook 24x24 polynomial multiply over 8 independent float lanes,
   run through the 5-register MULSTEP pipeline.  f is processed six rows at
   a time; the (h0..h4) arguments rotate each step so each accumulator is
   spilled exactly when its column is complete.  The first stripe starts
   from zero; later stripes reload partial sums already in h. */
static void mult24x8_float(__m256 h[48],const __m256 f[24],const __m256 g[24])
{
  int i, j;
  __m256 f0, f1, f2, f3, f4, f5, gj, h0, h1, h2, h3, h4;

  /* first stripe (rows 0..5): accumulators start from products only */
  i = 0;
  f0 = f[i];
  f1 = f[i + 1];
  f2 = f[i + 2];
  f3 = f[i + 3];
  f4 = f[i + 4];
  f5 = f[i + 5];
  MULSTEP_fromzero(0,h0,h1,h2,h3,h4)
  for (j = 0;j < 20;j += 5) {
    MULSTEP_noload(j + 1,h1,h2,h3,h4,h0)
    MULSTEP_noload(j + 2,h2,h3,h4,h0,h1)
    MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
    MULSTEP_noload(j + 4,h4,h0,h1,h2,h3)
    MULSTEP_noload(j + 5,h0,h1,h2,h3,h4)
  }
  /* drain: finish the last columns and spill the remaining accumulators */
  MULSTEP_noload(j + 1,h1,h2,h3,h4,h0)
  MULSTEP_noload(j + 2,h2,h3,h4,h0,h1)
  MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
  h[i + j + 4] = h4;
  h[i + j + 5] = h0;
  h[i + j + 6] = h1;
  h[i + j + 7] = h2;
  h[i + j + 8] = h3;

  /* remaining stripes (rows 6..23, six at a time): partial sums for the
     low columns are already in h, so reload them before accumulating */
  for (i = 6;i < 24;i += 6) {
    f0 = f[i];
    f1 = f[i + 1];
    f2 = f[i + 2];
    f3 = f[i + 3];
    f4 = f[i + 4];
    f5 = f[i + 5];
    h0 = h[i];
    h1 = h[i + 1];
    h2 = h[i + 2];
    h3 = h[i + 3];
    h4 = h[i + 4];
    for (j = 0;j < 15;j += 5) {
      MULSTEP(j + 0,h0,h1,h2,h3,h4)
      MULSTEP(j + 1,h1,h2,h3,h4,h0)
      MULSTEP(j + 2,h2,h3,h4,h0,h1)
      MULSTEP(j + 3,h3,h4,h0,h1,h2)
      MULSTEP(j + 4,h4,h0,h1,h2,h3)
    }
    /* beyond column 17 no previous stripe has written h, so switch to the
       noload variant and drain */
    MULSTEP(j + 0,h0,h1,h2,h3,h4)
    MULSTEP(j + 1,h1,h2,h3,h4,h0)
    MULSTEP(j + 2,h2,h3,h4,h0,h1)
    MULSTEP_noload(j + 3,h3,h4,h0,h1,h2)
    MULSTEP_noload(j + 4,h4,h0,h1,h2,h3)
    MULSTEP_noload(j + 5,h0,h1,h2,h3,h4)
    MULSTEP_noload(j + 6,h1,h2,h3,h4,h0)
    MULSTEP_noload(j + 7,h2,h3,h4,h0,h1)
    MULSTEP_noload(j + 8,h3,h4,h0,h1,h2)
    h[i + j + 9] = h4;
    h[i + j + 10] = h0;
    h[i + j + 11] = h1;
    h[i + j + 12] = h2;
    h[i + j + 13] = h3;
  }

  /* degree 23+23 = 46 is the top product term; lane 47 is always zero */
  h[47] = v0_float;
}
|
||||
|
||||
/* 48*8*float32 f inputs between -5000 and 5000 */
/* 48*8*float32 g inputs between -16 and 16 */
/* 96*8*float32 h outputs between -3840000 and 3840000 */
/* One level of Karatsuba on top of mult24x8_float: split f and g into low
   and high halves of 24, compute lo*lo, hi*hi, and (lo+hi)*(lo+hi), then
   recombine.  f01 aliases the upper half of h01 to save stack space --
   safe because mult24x8_float reads all of f before writing h[24..47]
   (the aliased region) -- TODO confirm that read/write ordering holds. */
static void mult48x8_float(__m256 h[96],const __m256 f[48],const __m256 g[48])
{
  __m256 h01[48];
  __m256 g01[24];
  __m256 *f01 = h01 + 24;
  int i;

  /* form the half-sums, two rows per iteration */
  for (i = 24;i > 0;) {
    i -= 2;
    f01[i] = f[i] + f[i + 24];
    g01[i] = g[i] + g[i + 24];
    f01[i + 1] = f[i + 1] + f[i + 1 + 24];
    g01[i + 1] = g[i + 1] + g[i + 1 + 24];
  }

  mult24x8_float(h,f,g);              /* lo*lo -> h[0..47]   */
  mult24x8_float(h + 48,f + 24,g + 24); /* hi*hi -> h[48..95]  */
  mult24x8_float(h01,f01,g01);        /* (lo+hi)*(lo+hi)     */

  /* Karatsuba recombination: middle = h01 - lo*lo - hi*hi, folded into
     the overlapping output ranges */
  for (i = 0;i < 24;++i) {
    __m256 h0i = h[i];
    __m256 h0itop = h[i + 24];
    __m256 h1i = h[i + 48];
    __m256 h1itop = h[i + 72];
    __m256 h01i = h01[i];
    __m256 h01itop = h01[i + 24];
    __m256 c = fastsub(h0itop,h1i);
    h[i + 24] = c + fastsub(h01i,h0i);
    h[i + 48] = fastsub(h01itop,h1itop) - c;
  }
}
|
||||
|
||||
/* 96*8*float32 f inputs between -2500 and 2500 */
/* 96*8*float32 g inputs between -8 and 8 */
/* 192*8*float32 h outputs between -1920000 and 1920000 */
/* Second Karatsuba level, structurally identical to mult48x8_float but on
   halves of 48 and delegating to it for the three sub-products.  f01 again
   aliases the upper half of the h01 scratch buffer. */
static void mult96x8_float(__m256 h[192],const __m256 f[96],const __m256 g[96])
{
  __m256 h01[96];
  __m256 g01[48];
  __m256 *f01 = h01 + 48;
  int i;

  /* form the half-sums, four rows per iteration (manual 4x unroll) */
  for (i = 48;i > 0;) {
    i -= 4;
    f01[i] = f[i] + f[i + 48];
    g01[i] = g[i] + g[i + 48];
    f01[i + 1] = f[i + 1] + f[i + 1 + 48];
    g01[i + 1] = g[i + 1] + g[i + 1 + 48];
    f01[i + 2] = f[i + 2] + f[i + 2 + 48];
    g01[i + 2] = g[i + 2] + g[i + 2 + 48];
    f01[i + 3] = f[i + 3] + f[i + 3 + 48];
    g01[i + 3] = g[i + 3] + g[i + 3 + 48];
  }

  mult48x8_float(h,f,g);              /* lo*lo -> h[0..95]    */
  mult48x8_float(h + 96,f + 48,g + 48); /* hi*hi -> h[96..191]  */
  mult48x8_float(h01,f01,g01);        /* (lo+hi)*(lo+hi)      */

  /* Karatsuba recombination, as in mult48x8_float */
  for (i = 0;i < 48;++i) {
    __m256 h0i = h[i];
    __m256 h0itop = h[i + 48];
    __m256 h1i = h[i + 96];
    __m256 h1itop = h[i + 144];
    __m256 h01i = h01[i];
    __m256 h01itop = h01[i + 48];
    __m256 c = fastsub(h0itop,h1i);
    h[i + 48] = c + fastsub(h01i,h0i);
    h[i + 96] = fastsub(h01itop,h1itop) - c;
  }
}
|
||||
|
||||
/* 96*16*int16 f inputs between -2500 and 2500 */
/* 96*(16*int8 stored in 32*int8) g inputs between -8 and 8 */
/* 192*16*int16 h outputs between -2400 and 2400 */
/* Integer wrapper around mult96x8_float.  Each __m256i holds 16 int16
   lanes but the float kernel handles only 8 lanes, so pass p = 0,1 covers
   the low and high 128-bit halves.  Conversion int->float and back uses
   the 0x4b400000 / 12582912.0 exponent-alignment trick (see alpha_*), and
   results are reduced mod 4591 before packing back to int16. */
static void mult96x16(__m256i h[192],const __m256i f[96],const __m256i g[96])
{
  __m256 hfloat[192];
  __m256 gfloat[96];
  __m256 *ffloat = hfloat + 96;
  int i, p;

  for (p = 0;p < 2;++p) {
    /* widen half p of each row to int32, then to float via the alpha
       bit-trick (add into the mantissa of 1.5*2^23, subtract the base) */
    for (i = 96;i > 0;) {
      i -= 2;
      __m256i fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i]));
      __m256i gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i]));
      __m256 storage;
      *(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32);
      ffloat[i] = storage - alpha_float;
      *(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32);
      gfloat[i] = storage - alpha_float;
      fi = _mm256_cvtepi16_epi32(_mm_loadu_si128(p + (const __m128i *) &f[i + 1]));
      gi = _mm256_cvtepi16_epi32(_mm_load_cvtepi8_epi16(p + (const long long *) &g[i + 1]));
      *(__m256i *) &storage = _mm256_add_epi32(fi,alpha_32);
      ffloat[i + 1] = storage - alpha_float;
      *(__m256i *) &storage = _mm256_add_epi32(gi,alpha_32);
      gfloat[i + 1] = storage - alpha_float;
    }
    mult96x8_float(hfloat,ffloat,gfloat);
    /* reduce mod 4591, convert float->int32 via the same alpha trick
       (add alpha, reinterpret, subtract the integer form), then saturate-
       pack int32 pairs down to int16 into half p of h */
    for (i = 192;i > 0;) {
      __m128i h0, h1;
      i -= 4;
      hfloat[i] = add(alpha_float,reduce(hfloat[i]));
      hfloat[i + 1] = fastadd(alpha_float,reduce(hfloat[i + 1]));
      hfloat[i + 2] = add(alpha_float,reduce(hfloat[i + 2]));
      hfloat[i + 3] = fastadd(alpha_float,reduce(hfloat[i + 3]));
      h0 = 0[(__m128i *) &hfloat[i]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
      h1 = 1[(__m128i *) &hfloat[i]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
      _mm_storeu_si128(p + (__m128i *) &h[i],_mm_packs_epi32(h0,h1));
      h0 = 0[(__m128i *) &hfloat[i + 1]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
      h1 = 1[(__m128i *) &hfloat[i + 1]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
      _mm_storeu_si128(p + (__m128i *) &h[i + 1],_mm_packs_epi32(h0,h1));
      h0 = 0[(__m128i *) &hfloat[i + 2]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
      h1 = 1[(__m128i *) &hfloat[i + 2]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
      _mm_storeu_si128(p + (__m128i *) &h[i + 2],_mm_packs_epi32(h0,h1));
      h0 = 0[(__m128i *) &hfloat[i + 3]]; h0 = _mm_sub_epi32(h0,alpha_32_128);
      h1 = 1[(__m128i *) &hfloat[i + 3]]; h1 = _mm_sub_epi32(h1,alpha_32_128);
      _mm_storeu_si128(p + (__m128i *) &h[i + 3],_mm_packs_epi32(h0,h1));
    }
  }
}
|
||||
|
||||
/* int16 i of output x[j] is int16 j of input x[i] */
/* In-register 16x16 transpose of an int16 matrix stored as 16 __m256i
   rows, built from a network of unpacklo/hi at widening granularities
   (16 -> 32 -> 64 -> 128 bit).  rev[] and the scattered output indices
   compensate for AVX2 unpack operating independently on the two 128-bit
   lanes. */
static void transpose16(__m256i x[16])
{
  const static int rev[4] = {0,4,2,6};
  int i;
  __m256i y[16];

  /* stage 1: interleave row pairs at 16- and 32-bit granularity */
  for (i = 0;i < 16;i += 4) {
    __m256i a0 = x[i];
    __m256i a1 = x[i + 1];
    __m256i a2 = x[i + 2];
    __m256i a3 = x[i + 3];
    __m256i b0 = _mm256_unpacklo_epi16(a0,a1);
    __m256i b1 = _mm256_unpackhi_epi16(a0,a1);
    __m256i b2 = _mm256_unpacklo_epi16(a2,a3);
    __m256i b3 = _mm256_unpackhi_epi16(a2,a3);
    __m256i c0 = _mm256_unpacklo_epi32(b0,b2);
    __m256i c2 = _mm256_unpackhi_epi32(b0,b2);
    __m256i c1 = _mm256_unpacklo_epi32(b1,b3);
    __m256i c3 = _mm256_unpackhi_epi32(b1,b3);
    y[i] = c0;
    y[i + 2] = c2;
    y[i + 1] = c1;
    y[i + 3] = c3;
  }
  /* stage 2: 64-bit interleave, then swap the 128-bit lanes with
     permute2x128 to complete the transpose */
  for (i = 0;i < 4;++i) {
    int r = rev[i];
    __m256i c0 = y[i];
    __m256i c4 = y[i + 4];
    __m256i c8 = y[i + 8];
    __m256i c12 = y[i + 12];
    __m256i d0 = _mm256_unpacklo_epi64(c0,c4);
    __m256i d4 = _mm256_unpackhi_epi64(c0,c4);
    __m256i d8 = _mm256_unpacklo_epi64(c8,c12);
    __m256i d12 = _mm256_unpackhi_epi64(c8,c12);
    __m256i e0 = _mm256_permute2x128_si256(d0,d8,0x20);
    __m256i e8 = _mm256_permute2x128_si256(d0,d8,0x31);
    __m256i e4 = _mm256_permute2x128_si256(d4,d12,0x20);
    __m256i e12 = _mm256_permute2x128_si256(d4,d12,0x31);
    x[r] = e0;
    x[r + 8] = e8;
    x[r + 1] = e4;
    x[r + 9] = e12;
  }
}
|
||||
|
||||
/* byte i of output x[j] is byte j of input x[i] */
/* In-register 32x32 transpose of a byte matrix stored as 32 __m256i rows.
   Same unpack-network idea as transpose16, with one extra widening stage
   (8 -> 16 -> 32 -> 64 -> 128 bit); rev[] and the interleaved output
   indices compensate for per-128-bit-lane unpack semantics. */
static void transpose32(__m256i x[32])
{
  const static int rev[4] = {0,8,4,12};
  int i;
  __m256i y[32];

  /* stage 1: interleave row quads at 8- and 16-bit granularity */
  for (i = 0;i < 32;i += 4) {
    __m256i a0 = x[i];
    __m256i a1 = x[i + 1];
    __m256i a2 = x[i + 2];
    __m256i a3 = x[i + 3];
    __m256i b0 = _mm256_unpacklo_epi8(a0,a1);
    __m256i b1 = _mm256_unpackhi_epi8(a0,a1);
    __m256i b2 = _mm256_unpacklo_epi8(a2,a3);
    __m256i b3 = _mm256_unpackhi_epi8(a2,a3);
    __m256i c0 = _mm256_unpacklo_epi16(b0,b2);
    __m256i c2 = _mm256_unpackhi_epi16(b0,b2);
    __m256i c1 = _mm256_unpacklo_epi16(b1,b3);
    __m256i c3 = _mm256_unpackhi_epi16(b1,b3);
    y[i] = c0;
    y[i + 2] = c2;
    y[i + 1] = c1;
    y[i + 3] = c3;
  }
  /* stage 2: 32- and 64-bit interleave, then 128-bit lane permutes to
     finish the transpose */
  for (i = 0;i < 4;++i) {
    int r = rev[i];
    __m256i c0 = y[i];
    __m256i c8 = y[i + 8];
    __m256i c16 = y[i + 16];
    __m256i c24 = y[i + 24];
    __m256i c4 = y[i + 4];
    __m256i c12 = y[i + 12];
    __m256i c20 = y[i + 20];
    __m256i c28 = y[i + 28];
    __m256i d0 = _mm256_unpacklo_epi32(c0,c4);
    __m256i d4 = _mm256_unpackhi_epi32(c0,c4);
    __m256i d8 = _mm256_unpacklo_epi32(c8,c12);
    __m256i d12 = _mm256_unpackhi_epi32(c8,c12);
    __m256i d16 = _mm256_unpacklo_epi32(c16,c20);
    __m256i d20 = _mm256_unpackhi_epi32(c16,c20);
    __m256i d24 = _mm256_unpacklo_epi32(c24,c28);
    __m256i d28 = _mm256_unpackhi_epi32(c24,c28);
    __m256i e0 = _mm256_unpacklo_epi64(d0,d8);
    __m256i e8 = _mm256_unpackhi_epi64(d0,d8);
    __m256i e16 = _mm256_unpacklo_epi64(d16,d24);
    __m256i e24 = _mm256_unpackhi_epi64(d16,d24);
    __m256i e4 = _mm256_unpacklo_epi64(d4,d12);
    __m256i e12 = _mm256_unpackhi_epi64(d4,d12);
    __m256i e20 = _mm256_unpacklo_epi64(d20,d28);
    __m256i e28 = _mm256_unpackhi_epi64(d20,d28);
    __m256i f0 = _mm256_permute2x128_si256(e0,e16,0x20);
    __m256i f16 = _mm256_permute2x128_si256(e0,e16,0x31);
    __m256i f8 = _mm256_permute2x128_si256(e8,e24,0x20);
    __m256i f24 = _mm256_permute2x128_si256(e8,e24,0x31);
    __m256i f4 = _mm256_permute2x128_si256(e4,e20,0x20);
    __m256i f20 = _mm256_permute2x128_si256(e4,e20,0x31);
    __m256i f12 = _mm256_permute2x128_si256(e12,e28,0x20);
    __m256i f28 = _mm256_permute2x128_si256(e12,e28,0x31);
    x[r] = f0;
    x[r + 16] = f16;
    x[r + 1] = f8;
    x[r + 17] = f24;
    x[r + 2] = f4;
    x[r + 18] = f20;
    x[r + 3] = f12;
    x[r + 19] = f28;
  }
}
|
||||
|
||||
/* 48*16*int16 f inputs between -2295 and 2295 */
|
||||
/* 24*32*int8 g inputs between -1 and 1 */
|
||||
/* 96*16*int16 h outputs between -2295 and 2295 */
|
||||
static void mult768_mix2_m256i(__m256i h[96],const __m256i f[48],const __m256i g[24])
|
||||
{
|
||||
__m256i hkara[24][16];
|
||||
__m256i gkara[3][32];
|
||||
#define fkara hkara
|
||||
int i;
|
||||
|
||||
for (i = 6;i-- > 0;) {
|
||||
__m256i f0, f1, f2, f3, f4, f5, f6, f7;
|
||||
__m256i f01, f23, f45, f67;
|
||||
__m256i f02, f46, f04, f26, f0426;
|
||||
__m256i f13, f57, f15, f37, f1537;
|
||||
__m256i f0213, f4657, f04261537, f0415, f2637;
|
||||
|
||||
f0 = _mm256_loadu_si256(&f[i + 0]);
|
||||
f1 = _mm256_loadu_si256(&f[i + 6]);
|
||||
f2 = _mm256_loadu_si256(&f[i + 12]);
|
||||
f3 = _mm256_loadu_si256(&f[i + 18]);
|
||||
f4 = _mm256_loadu_si256(&f[i + 24]);
|
||||
f5 = _mm256_loadu_si256(&f[i + 30]);
|
||||
f6 = _mm256_loadu_si256(&f[i + 36]);
|
||||
f7 = _mm256_loadu_si256(&f[i + 42]);
|
||||
f01 = squeezeadd16(f0,f1); fkara[i][8] = f01;
|
||||
f23 = squeezeadd16(f2,f3); fkara[i][9] = f23;
|
||||
f45 = squeezeadd16(f4,f5); fkara[i][10] = f45;
|
||||
f67 = squeezeadd16(f6,f7); fkara[i][11] = f67;
|
||||
|
||||
fkara[i][0] = f0;
|
||||
fkara[i][2] = f2;
|
||||
fkara[i][4] = f4;
|
||||
fkara[i][6] = f6;
|
||||
|
||||
f02 = squeezeadd16(f0,f2); fkara[i + 6][0] = f02;
|
||||
f04 = squeezeadd16(f0,f4); fkara[i + 6][6] = f04;
|
||||
f46 = squeezeadd16(f4,f6); fkara[i + 6][3] = f46;
|
||||
f26 = squeezeadd16(f2,f6); fkara[i + 6][8] = f26;
|
||||
|
||||
fkara[i][1] = f1;
|
||||
fkara[i][3] = f3;
|
||||
fkara[i][5] = f5;
|
||||
fkara[i][7] = f7;
|
||||
|
||||
f13 = squeezeadd16(f1,f3); fkara[i + 6][1] = f13;
|
||||
f15 = squeezeadd16(f1,f5); fkara[i + 6][7] = f15;
|
||||
f57 = squeezeadd16(f5,f7); fkara[i + 6][4] = f57;
|
||||
f37 = squeezeadd16(f3,f7); fkara[i + 6][9] = f37;
|
||||
|
||||
f0426 = squeezeadd16(f04,f26); fkara[i + 6][12] = f0426;
|
||||
f1537 = squeezeadd16(f15,f37); fkara[i + 6][13] = f1537;
|
||||
f0213 = squeezeadd16(f02,f13); fkara[i + 6][2] = f0213;
|
||||
f4657 = squeezeadd16(f46,f57); fkara[i + 6][5] = f4657;
|
||||
f0415 = squeezeadd16(f04,f15); fkara[i + 6][10] = f0415;
|
||||
f2637 = squeezeadd16(f26,f37); fkara[i + 6][11] = f2637;
|
||||
f04261537 = squeezeadd16(f0426,f1537); fkara[i + 6][14] = f04261537;
|
||||
|
||||
fkara[i][12] = v0;
|
||||
fkara[i][13] = v0;
|
||||
fkara[i][14] = v0;
|
||||
fkara[i][15] = v0;
|
||||
fkara[i + 6][15] = v0;
|
||||
}
|
||||
|
||||
for (i = 3;i-- > 0;) {
|
||||
__m256i g0, g1, g2, g3, g4, g5, g6, g7;
|
||||
__m256i g01, g23, g45, g67;
|
||||
__m256i g02, g46, g04, g26, g0426;
|
||||
__m256i g13, g57, g15, g37, g1537;
|
||||
__m256i g0213, g4657, g04261537, g0415, g2637;
|
||||
|
||||
g0 = _mm256_loadu_si256(&g[i + 0]);
|
||||
g1 = _mm256_loadu_si256(&g[i + 3]);
|
||||
g2 = _mm256_loadu_si256(&g[i + 6]);
|
||||
g3 = _mm256_loadu_si256(&g[i + 9]);
|
||||
g4 = _mm256_loadu_si256(&g[i + 12]);
|
||||
g5 = _mm256_loadu_si256(&g[i + 15]);
|
||||
g6 = _mm256_loadu_si256(&g[i + 18]);
|
||||
g7 = _mm256_loadu_si256(&g[i + 21]);
|
||||
g01 = _mm256_add_epi8(g0,g1); gkara[i][8] = g01;
|
||||
g23 = _mm256_add_epi8(g2,g3); gkara[i][9] = g23;
|
||||
g45 = _mm256_add_epi8(g4,g5); gkara[i][10] = g45;
|
||||
g67 = _mm256_add_epi8(g6,g7); gkara[i][11] = g67;
|
||||
|
||||
gkara[i][0] = g0;
|
||||
gkara[i][2] = g2;
|
||||
gkara[i][4] = g4;
|
||||
gkara[i][6] = g6;
|
||||
|
||||
g02 = _mm256_add_epi8(g0,g2); gkara[i][16] = g02;
|
||||
g04 = _mm256_add_epi8(g0,g4); gkara[i][22] = g04;
|
||||
g46 = _mm256_add_epi8(g4,g6); gkara[i][19] = g46;
|
||||
g26 = _mm256_add_epi8(g2,g6); gkara[i][24] = g26;
|
||||
|
||||
gkara[i][1] = g1;
|
||||
gkara[i][3] = g3;
|
||||
gkara[i][5] = g5;
|
||||
gkara[i][7] = g7;
|
||||
|
||||
g13 = _mm256_add_epi8(g1,g3); gkara[i][17] = g13;
|
||||
g15 = _mm256_add_epi8(g1,g5); gkara[i][23] = g15;
|
||||
g57 = _mm256_add_epi8(g5,g7); gkara[i][20] = g57;
|
||||
g37 = _mm256_add_epi8(g3,g7); gkara[i][25] = g37;
|
||||
|
||||
g0426 = _mm256_add_epi8(g04,g26); gkara[i][28] = g0426;
|
||||
g1537 = _mm256_add_epi8(g15,g37); gkara[i][29] = g1537;
|
||||
g0213 = _mm256_add_epi8(g02,g13); gkara[i][18] = g0213;
|
||||
g4657 = _mm256_add_epi8(g46,g57); gkara[i][21] = g4657;
|
||||
g0415 = _mm256_add_epi8(g04,g15); gkara[i][26] = g0415;
|
||||
g2637 = _mm256_add_epi8(g26,g37); gkara[i][27] = g2637;
|
||||
g04261537 = _mm256_add_epi8(g0426,g1537); gkara[i][30] = g04261537;
|
||||
|
||||
gkara[i][12] = v0;
|
||||
gkara[i][13] = v0;
|
||||
gkara[i][14] = v0;
|
||||
gkara[i][15] = v0;
|
||||
gkara[i][31] = v0;
|
||||
}
|
||||
|
||||
for (i = 12;i-- > 0;)
|
||||
transpose16(fkara[i]);
|
||||
for (i = 3;i-- > 0;)
|
||||
transpose32(gkara[i]);
|
||||
|
||||
mult96x16(hkara[12],fkara[6],(__m256i *) (1 + (__m128i *) gkara));
|
||||
mult96x16(hkara[0],fkara[0],gkara[0]);
|
||||
|
||||
for (i = 24;i-- > 0;)
|
||||
transpose16(hkara[i]);
|
||||
|
||||
for (i = 6;i-- > 0;) {
|
||||
__m256i h0,h1,h2,h3,h4,h5,h6,h7,h8,h9;
|
||||
__m256i h10,h11,h12,h13,h14,h15,h16,h17,h18,h19;
|
||||
__m256i h20,h21,h22,h23;
|
||||
__m256i h32,h33,h34,h35,h36,h37,h38,h39;
|
||||
__m256i h40,h41,h42,h43,h44,h45,h46,h47,h48,h49;
|
||||
__m256i h50,h51,h52,h53,h54,h55,h56,h57,h58,h59;
|
||||
__m256i h60,h61;
|
||||
__m256i c;
|
||||
|
||||
#define COMBINE(h0,h1,h2,h3,x0,x1) \
|
||||
c = _mm256_sub_epi16(h1,h2); \
|
||||
h1 = _mm256_sub_epi16(_mm256_add_epi16(c,x0),h0); \
|
||||
h2 = _mm256_sub_epi16(x1,_mm256_add_epi16(c,h3)); \
|
||||
h1 = squeeze(h1); \
|
||||
h2 = squeeze(h2);
|
||||
|
||||
h56 = hkara[i + 12][12];
|
||||
h57 = hkara[i + 18][12];
|
||||
h58 = hkara[i + 12][13];
|
||||
h59 = hkara[i + 18][13];
|
||||
h60 = hkara[i + 12][14];
|
||||
h61 = hkara[i + 18][14];
|
||||
COMBINE(h56,h57,h58,h59,h60,h61)
|
||||
|
||||
h44 = hkara[i + 12][6];
|
||||
h45 = hkara[i + 18][6];
|
||||
h46 = hkara[i + 12][7];
|
||||
h47 = hkara[i + 18][7];
|
||||
h52 = hkara[i + 12][10];
|
||||
h53 = hkara[i + 18][10];
|
||||
COMBINE(h44,h45,h46,h47,h52,h53)
|
||||
|
||||
h48 = hkara[i + 12][8];
|
||||
h49 = hkara[i + 18][8];
|
||||
h50 = hkara[i + 12][9];
|
||||
h51 = hkara[i + 18][9];
|
||||
h54 = hkara[i + 12][11];
|
||||
h55 = hkara[i + 18][11];
|
||||
COMBINE(h48,h49,h50,h51,h54,h55)
|
||||
COMBINE(h44,h46,h48,h50,h56,h58)
|
||||
COMBINE(h45,h47,h49,h51,h57,h59)
|
||||
|
||||
h0 = hkara[i][0];
|
||||
h1 = hkara[i + 6][0];
|
||||
h2 = hkara[i][1];
|
||||
h3 = hkara[i + 6][1];
|
||||
h16 = hkara[i][8];
|
||||
h17 = hkara[i + 6][8];
|
||||
COMBINE(h0,h1,h2,h3,h16,h17)
|
||||
|
||||
h4 = hkara[i][2];
|
||||
h5 = hkara[i + 6][2];
|
||||
h6 = hkara[i][3];
|
||||
h7 = hkara[i + 6][3];
|
||||
h18 = hkara[i][9];
|
||||
h19 = hkara[i + 6][9];
|
||||
COMBINE(h4,h5,h6,h7,h18,h19)
|
||||
|
||||
h32 = hkara[i + 12][0];
|
||||
h33 = hkara[i + 18][0];
|
||||
h34 = hkara[i + 12][1];
|
||||
h35 = hkara[i + 18][1];
|
||||
h36 = hkara[i + 12][2];
|
||||
h37 = hkara[i + 18][2];
|
||||
COMBINE(h32,h33,h34,h35,h36,h37)
|
||||
COMBINE(h1,h3,h5,h7,h33,h35)
|
||||
COMBINE(h0,h2,h4,h6,h32,h34)
|
||||
|
||||
h8 = hkara[i][4];
|
||||
h9 = hkara[i + 6][4];
|
||||
h10 = hkara[i][5];
|
||||
h11 = hkara[i + 6][5];
|
||||
h20 = hkara[i][10];
|
||||
h21 = hkara[i + 6][10];
|
||||
COMBINE(h8,h9,h10,h11,h20,h21)
|
||||
|
||||
h12 = hkara[i][6];
|
||||
h13 = hkara[i + 6][6];
|
||||
h14 = hkara[i][7];
|
||||
h15 = hkara[i + 6][7];
|
||||
h22 = hkara[i][11];
|
||||
h23 = hkara[i + 6][11];
|
||||
COMBINE(h12,h13,h14,h15,h22,h23)
|
||||
|
||||
h38 = hkara[i + 12][3];
|
||||
h39 = hkara[i + 18][3];
|
||||
h40 = hkara[i + 12][4];
|
||||
h41 = hkara[i + 18][4];
|
||||
h42 = hkara[i + 12][5];
|
||||
h43 = hkara[i + 18][5];
|
||||
COMBINE(h38,h39,h40,h41,h42,h43)
|
||||
COMBINE(h8,h10,h12,h14,h38,h40)
|
||||
COMBINE(h9,h11,h13,h15,h39,h41)
|
||||
|
||||
COMBINE(h0,h4,h8,h12,h44,h48)
|
||||
h0 = freeze(h0);
|
||||
h4 = freeze(h4);
|
||||
h8 = freeze(h8);
|
||||
h12 = freeze(h12);
|
||||
_mm256_storeu_si256(&h[i + 0],h0);
|
||||
_mm256_storeu_si256(&h[i + 24],h4);
|
||||
_mm256_storeu_si256(&h[i + 48],h8);
|
||||
_mm256_storeu_si256(&h[i + 72],h12);
|
||||
|
||||
COMBINE(h1,h5,h9,h13,h45,h49)
|
||||
h1 = freeze(h1);
|
||||
h5 = freeze(h5);
|
||||
h9 = freeze(h9);
|
||||
h13 = freeze(h13);
|
||||
_mm256_storeu_si256(&h[i + 6],h1);
|
||||
_mm256_storeu_si256(&h[i + 30],h5);
|
||||
_mm256_storeu_si256(&h[i + 54],h9);
|
||||
_mm256_storeu_si256(&h[i + 78],h13);
|
||||
|
||||
COMBINE(h2,h6,h10,h14,h46,h50)
|
||||
h2 = freeze(h2);
|
||||
h6 = freeze(h6);
|
||||
h10 = freeze(h10);
|
||||
h14 = freeze(h14);
|
||||
_mm256_storeu_si256(&h[i + 12],h2);
|
||||
_mm256_storeu_si256(&h[i + 36],h6);
|
||||
_mm256_storeu_si256(&h[i + 60],h10);
|
||||
_mm256_storeu_si256(&h[i + 84],h14);
|
||||
|
||||
COMBINE(h3,h7,h11,h15,h47,h51)
|
||||
h3 = freeze(h3);
|
||||
h7 = freeze(h7);
|
||||
h11 = freeze(h11);
|
||||
h15 = freeze(h15);
|
||||
_mm256_storeu_si256(&h[i + 18],h3);
|
||||
_mm256_storeu_si256(&h[i + 42],h7);
|
||||
_mm256_storeu_si256(&h[i + 66],h11);
|
||||
_mm256_storeu_si256(&h[i + 90],h15);
|
||||
}
|
||||
}
|
||||
|
||||
#define p 761
|
||||
|
||||
/* 761 f inputs between -2295 and 2295 */
/* 761 g inputs between -1 and 1 */
/* 761 h outputs between -2295 and 2295 */
/* Polynomial multiplication h = f*g in R/q, reduced modulo x^p - x - 1. */
void rq_mult(modq *h,const modq *f,const small *g)
{
  __m256i fgvec[96];   /* raw 768-coefficient product, 96 * 16 modq lanes */
  modq *fg;
  int i;

  /* mult768_mix2_m256i presumably computes the full 768x768 product of
     f and g into fgvec — TODO confirm against its definition above. */
  mult768_mix2_m256i(fgvec,(__m256i *) f,(__m256i *) g);
  fg = (modq *) fgvec;

  /* Reduce mod x^p - x - 1: x^(i+p) contributes to both x^(i+1) and x^i,
     so h[i] = fg[i] + fg[i+p-1] + fg[i+p]; h[0] has no i-1 term. */
  h[0] = modq_freeze(fg[0] + fg[p]);
  /* Scalar head until index 9, after which 16-lane loads stay in bounds. */
  for (i = 1;i < 9;++i)
    h[i] = modq_freeze(fg[i] + fg[i + p - 1] + fg[i + p]);
  for (i = 9;i < 761;i += 16) {
    __m256i fgi = _mm256_loadu_si256((__m256i *) &fg[i]);
    __m256i fgip = _mm256_loadu_si256((__m256i *) &fg[i + p]);
    __m256i fgip1 = _mm256_loadu_si256((__m256i *) &fg[i + p - 1]);
    __m256i x = _mm256_add_epi16(fgi,_mm256_add_epi16(fgip,fgip1));
    x = freeze(squeeze(x));
    _mm256_storeu_si256((__m256i *) &h[i],x);
  }
  /* Zero-pad h up to the 768-element vector width. */
  for (i = 761;i < 768;++i)
    h[i] = 0;
}
|
@ -1,15 +0,0 @@
|
||||
#ifndef params_h
|
||||
#define params_h
|
||||
|
||||
#define q 4591
|
||||
/* XXX: also built into modq in various ways */
|
||||
|
||||
#define qshift 2295
|
||||
#define p 761
|
||||
#define w 250
|
||||
|
||||
#define rq_encode_len 1218
|
||||
#define rq_encoderounded_len 1015
|
||||
#define small_encode_len 191
|
||||
|
||||
#endif
|
@ -1,29 +0,0 @@
|
||||
#include "params.h"
|
||||
#include "randombytes.h"
|
||||
#include "int32_sort.h"
|
||||
#include "small.h"
|
||||
#include "crypto_stream_aes256ctr.h"
|
||||
|
||||
static const unsigned char n[16] = {0};
|
||||
|
||||
/* Deterministically expand the 32-byte seed k into a random polynomial f
   with exactly w coefficients in {-1,+1} and the rest 0, padded to 768. */
void small_seeded_weightw(small *f,const unsigned char *k)
{
  crypto_int32 r[768];
  int i;

  /* Fill r with AES-256-CTR keystream under seed k (all-zero nonce n). */
  crypto_stream_aes256ctr((unsigned char *) r,sizeof r,n,k);
  /* Flip the top bit of each word; distribution-preserving for uniform
     input (NOTE(review): likely chosen so signed sort order matches the
     unsigned keystream order — confirm). */
  for (i = 0;i < p;++i) r[i] ^= 0x80000000;

  /* Tag low bits: first w entries get bit0=0 (coefficient will be +-1,
     chosen by random bit1); remaining entries get bits 01 (coefficient 0). */
  for (i = 0;i < w;++i) r[i] &= -2;
  for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1;
  /* Sorting by the random high bits permutes the tags into random positions. */
  int32_sort(r,p);
  /* Decode low two bits: 00 -> -1, 01 -> 0, 10 -> +1. */
  for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1;
  /* Zero-pad to vector width 768. */
  for (i = p;i < 768;++i) f[i] = 0;
}
|
||||
|
||||
/* Sample a fresh random weight-w small polynomial into f. */
void small_random_weightw(small *f)
{
  unsigned char seed[32];

  /* Draw a one-shot 32-byte seed, then expand it deterministically. */
  randombytes(seed,32);
  small_seeded_weightw(f,seed);
}
|
@ -1,37 +0,0 @@
|
||||
#ifndef rq_h
|
||||
#define rq_h
|
||||
|
||||
#include "modq.h"
|
||||
#include "small.h"
|
||||
|
||||
#define rq_encode crypto_kem_ntrulpr4591761_avx_rq_encode
|
||||
extern void rq_encode(unsigned char *,const modq *);
|
||||
|
||||
#define rq_decode crypto_kem_ntrulpr4591761_avx_rq_decode
|
||||
extern void rq_decode(modq *,const unsigned char *);
|
||||
|
||||
#define rq_roundencode crypto_kem_ntrulpr4591761_avx_rq_roundencode
|
||||
extern void rq_roundencode(unsigned char *,const modq *);
|
||||
|
||||
#define rq_decoderounded crypto_kem_ntrulpr4591761_avx_rq_decoderounded
|
||||
extern void rq_decoderounded(modq *,const unsigned char *);
|
||||
|
||||
#define rq_round3 crypto_kem_ntrulpr4591761_avx_rq_round
|
||||
extern void rq_round3(modq *,const modq *);
|
||||
|
||||
#define rq_mult crypto_kem_ntrulpr4591761_avx_rq_mult
|
||||
extern void rq_mult(modq *,const modq *,const small *);
|
||||
|
||||
#define rq_recip3 crypto_kem_ntrulpr4591761_avx_rq_recip3
|
||||
int rq_recip3(modq *,const small *);
|
||||
|
||||
#define rq_fromseed crypto_kem_ntrulpr4591761_avx_rq_fromseed
|
||||
extern void rq_fromseed(modq *,const unsigned char *);
|
||||
|
||||
#define rq_top crypto_kem_ntrulpr4591761_avx_rq_top
|
||||
extern void rq_top(unsigned char *,const modq *,const unsigned char *);
|
||||
|
||||
#define rq_rightsubbit crypto_kem_ntrulpr4591761_avx_rq_rightsubbit
|
||||
extern void rq_rightsubbit(unsigned char *,const unsigned char *,const modq *);
|
||||
|
||||
#endif
|
@ -1,21 +0,0 @@
|
||||
#include "crypto_stream_aes256ctr.h"
|
||||
#include "rq.h"
|
||||
#include "params.h"
|
||||
|
||||
static const unsigned char n[16] = {0};
|
||||
|
||||
/* Expand the 32-byte seed K into a uniform ring element h (768-padded). */
void rq_fromseed(modq *h,const unsigned char *K)
{
  crypto_uint32 buf[768];
  int i;

  /* AES-256-CTR keystream under key K with all-zero nonce n. */
  crypto_stream_aes256ctr((unsigned char *) buf,sizeof buf,n,K);
  /* will use 761*4 bytes */
  /* convenient for aes to generate multiples of 16 bytes */
  /* and multiples of more for some implementations */

  /* Map each uniform 32-bit word to a centered mod-q coefficient. */
  for (i = 0;i < p;++i)
    h[i] = modq_fromuint32(buf[i]);
  /* Zero-pad to vector width 768. */
  for (i = p;i < 768;++i)
    h[i] = 0;
}
|
@ -1,21 +0,0 @@
|
||||
#include "rq.h"
|
||||
#include "params.h"
|
||||
|
||||
void rq_rightsubbit(unsigned char *r,const unsigned char *c,const modq *ab)
|
||||
{
|
||||
modq t[256];
|
||||
int i;
|
||||
|
||||
for (i = 0;i < 128;++i) {
|
||||
crypto_uint32 x = c[i];
|
||||
t[2*i] = (x & 15) * 287 - 2007;
|
||||
t[2*i+1] = (x >> 4) * 287 - 2007;
|
||||
}
|
||||
|
||||
for (i = 0;i < 256;++i)
|
||||
t[i] = -(modq_freeze(t[i] - ab[i] + 4*w+1) >> 14);
|
||||
|
||||
for (i = 0;i < 32;++i) r[i] = 0;
|
||||
for (i = 0;i < 256;++i)
|
||||
r[i / 8] |= (t[i] << (i & 7));
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
#include "rq.h"
|
||||
|
||||
/* Mask the 256 bits of r into the top coefficients of f and emit
   128 packed ciphertext bytes (two 4-bit values per byte). */
void rq_top(unsigned char *c,const modq *f,const unsigned char *r)
{
  modq top[256];
  int i;

  for (i = 0;i < 256;++i) {
    /* Add qshift (2295) when bit i of r is set, then compress the
       centered coefficient down to 4 bits. */
    modq masked = modq_sum(f[i],2295 * (1 & (r[i / 8] >> (i & 7))));
    top[i] = ((masked + 2156) * 114 + 16384) >> 15; /* between 0 and 15 */
  }

  /* Two 4-bit values per output byte, low nibble first. */
  for (i = 0;i < 128;++i)
    c[i] = top[2*i] + (top[2*i + 1] << 4);
}
|
@ -1,44 +0,0 @@
|
||||
#include "params.h"
|
||||
#include "small.h"
|
||||
|
||||
/* XXX: these functions rely on p mod 4 = 1 */
|
||||
|
||||
/* all coefficients in -1, 0, 1 */
|
||||
void small_encode(unsigned char *c,const small *f)
|
||||
{
|
||||
small c0;
|
||||
int i;
|
||||
|
||||
for (i = 0;i < p/4;++i) {
|
||||
c0 = *f++ + 1;
|
||||
c0 += (*f++ + 1) << 2;
|
||||
c0 += (*f++ + 1) << 4;
|
||||
c0 += (*f++ + 1) << 6;
|
||||
*c++ = c0;
|
||||
}
|
||||
c0 = *f++ + 1;
|
||||
*c++ = c0;
|
||||
}
|
||||
|
||||
/* Inverse of small_encode: unpack four two-bit trits per byte back into
   coefficients in {-1,0,1}, then zero-pad f up to 768 entries. */
void small_decode(small *f,const unsigned char *c)
{
  unsigned char b;
  int i,j;

  for (i = 0;i < p/4;++i) {
    b = *c++;
    for (j = 0;j < 4;++j) {
      *f++ = ((small) (b & 3)) - 1;
      b >>= 2;
    }
  }
  /* Final byte carries the single leftover coefficient (p mod 4 = 1). */
  b = *c++;
  *f++ = ((small) (b & 3)) - 1;
  /* Zero-pad to vector width 768. */
  for (j = 0;j < 7;++j)
    *f++ = 0;
}
|
@ -1,27 +0,0 @@
|
||||
#ifndef small_h
|
||||
#define small_h
|
||||
|
||||
#include "crypto_int8.h"
|
||||
#include "crypto_int32.h"
|
||||
|
||||
typedef crypto_int8 small;
|
||||
|
||||
#define small_encode crypto_kem_ntrulpr4591761_avx_small_encode
|
||||
extern void small_encode(unsigned char *,const small *);
|
||||
|
||||
#define small_decode crypto_kem_ntrulpr4591761_avx_small_decode
|
||||
extern void small_decode(small *,const unsigned char *);
|
||||
|
||||
#define small_random32 crypto_kem_ntrulpr4591761_avx_small_random32
|
||||
extern crypto_int32 small_random32(void);
|
||||
|
||||
#define small_random crypto_kem_ntrulpr4591761_avx_small_random
|
||||
extern void small_random(small *);
|
||||
|
||||
#define small_seeded_weightw crypto_kem_ntrulpr4591761_avx_small_seeded_weightw
|
||||
extern void small_seeded_weightw(small *,const unsigned char *);
|
||||
|
||||
#define small_random_weightw crypto_kem_ntrulpr4591761_avx_small_random_weightw
|
||||
extern void small_random_weightw(small *);
|
||||
|
||||
#endif
|
@ -1 +0,0 @@
|
||||
bcc60c85ac6ca2dbbe244878ba9b62019560516e8377aecd890c737bf5dcb05f
|
@ -1 +0,0 @@
|
||||
a13b63e4929ab2ab97f7889f071245113ddd919bdaf1c883e12cd80fdf4f9e3e
|
@ -1 +0,0 @@
|
||||
Streamlined NTRU Prime 4591^761
|
@ -1,5 +0,0 @@
|
||||
Alphabetical order:
|
||||
Daniel J. Bernstein
|
||||
Chitchanok Chuengsatiansup
|
||||
Tanja Lange
|
||||
Christine van Vredendaal
|
@ -1,30 +0,0 @@
|
||||
This is a reference implementation of NTRU LPRime 4591^761. This
|
||||
implementation is designed primarily for clarity, subject to the
|
||||
following constraints:
|
||||
|
||||
* The implementation is written in C. We have a separate Sage
|
||||
implementation that is considerably more concise.
|
||||
|
||||
* The implementation avoids data-dependent branches and array
|
||||
indices. For example, conditional swaps are computed by arithmetic
|
||||
rather than by branches.
|
||||
|
||||
* The implementation avoids other C operations that often take
|
||||
variable time. For example, divisions by 3 are computed via
|
||||
multiplications and shifts.
|
||||
|
||||
This implementation does _not_ sacrifice clarity for speed.
|
||||
|
||||
This implementation has not yet been reviewed for correctness or for
|
||||
constant-time behavior. It does pass various tests and has no known
|
||||
bugs, but there are at least some platforms where multiplications take
|
||||
variable time, and fixing this requires platform-specific effort; see
|
||||
https://www.bearssl.org/ctmul.html and http://repository.tue.nl/800603.
|
||||
|
||||
This implementation allows "benign malleability" of ciphertexts, as
|
||||
defined in http://www.shoup.net/papers/iso-1_1.pdf. A similar comment
|
||||
applies to public keys.
|
||||
|
||||
There is a separate "avx" implementation where similar comments apply,
|
||||
except that "avx" _does_ sacrifice clarity for speed on CPUs with AVX2
|
||||
instructions.
|
@ -1,4 +0,0 @@
|
||||
#define CRYPTO_SECRETKEYBYTES 1238
|
||||
#define CRYPTO_PUBLICKEYBYTES 1047
|
||||
#define CRYPTO_CIPHERTEXTBYTES 1175
|
||||
#define CRYPTO_BYTES 32
|
@ -1,68 +0,0 @@
|
||||
#ifdef KAT
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#include "params.h"
|
||||
#include "small.h"
|
||||
#include "rq.h"
|
||||
#include "hide.h"
|
||||
#include "crypto_kem.h"
|
||||
|
||||
static int verify(const unsigned char *x,const unsigned char *y)
|
||||
{
|
||||
unsigned int differentbits = 0;
|
||||
int i;
|
||||
for (i = 0;i < crypto_kem_CIPHERTEXTBYTES;++i)
|
||||
differentbits |= x[i] ^ y[i];
|
||||
return (1 & ((differentbits - 1) >> 8)) - 1;
|
||||
}
|
||||
|
||||
/* KEM decapsulation: recover the 256 masked bits from the ciphertext,
   re-encrypt them, and release the session key only if the re-encryption
   reproduces the ciphertext exactly (FO-style check).
   Returns 0 on success, -1 on rejection (k is zeroed). */
int crypto_kem_dec(
  unsigned char *k,
  const unsigned char *cstr,
  const unsigned char *sk
)
{
  small a[p];          /* secret small polynomial, decoded from sk */
  modq B[p];
  modq aB[p];
  modq C[256];
  unsigned char r[32];
  unsigned char checkcstr[crypto_kem_CIPHERTEXTBYTES];
  unsigned char maybek[32];
  int i;
  int result;

  /* sk = encoded a || embedded public key; advance to the pk copy. */
  small_decode(a,sk); sk += small_encode_len;
  rq_decoderounded(B,cstr + 32);
  rq_mult(aB,B,a);

  /* Unpack the trailing 128 ciphertext bytes into 256 4-bit values,
     mapped back to the centered representatives used at encryption. */
  for (i = 0;i < 128;++i) {
    crypto_uint32 x = cstr[32 + rq_encoderounded_len + i];
    C[2*i] = (x & 15) * 287 - 2007;
    C[2*i+1] = (x >> 4) * 287 - 2007;
  }

  /* Each recovered bit is the (negated) top bit of the frozen difference. */
  for (i = 0;i < 256;++i)
    C[i] = -(modq_freeze(C[i] - aB[i] + 4*w+1) >> 14);

  /* Pack the 256 bits into the 32-byte string r, LSB first. */
  for (i = 0;i < 32;++i) r[i] = 0;
  for (i = 0;i < 256;++i)
    r[i / 8] |= (C[i] << (i & 7));

#ifdef KAT
  {
    int j;
    printf("decrypt r: ");
    for (j = 0;j < 32;++j)
      printf("%02x",255 & (int) r[j]);
    printf("\n");
  }
#endif

  /* Re-encrypt deterministically with r; compare in constant time. */
  hide(checkcstr,maybek,sk,r);
  result = verify(cstr,checkcstr);

  /* result is 0 (match) or -1 (mismatch): ~result masks k to maybek
     on success and to all-zero on failure, without branching. */
  for (i = 0;i < 32;++i) k[i] = maybek[i] & ~result;
  return result;
}
|
@ -1,30 +0,0 @@
|
||||
#ifdef KAT
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#include "hide.h"
|
||||
#include "randombytes.h"
|
||||
#include "crypto_kem.h"
|
||||
|
||||
/* KEM encapsulation: draw 32 random bytes and let hide() derive both
   the shared key k and the ciphertext cstr from them. Always returns 0. */
int crypto_kem_enc(
  unsigned char *cstr,
  unsigned char *k,
  const unsigned char *pk
)
{
  unsigned char coins[32];

  randombytes(coins,32);

#ifdef KAT
  {
    int j;
    printf("encrypt r: ");
    for (j = 0;j < 32;++j)
      printf("%02x",255 & (int) coins[j]);
    printf("\n");
  }
#endif

  hide(cstr,k,pk,coins);
  return 0;
}
|
@ -1,49 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "crypto_hash_sha512.h"
|
||||
#include "crypto_kem.h"
|
||||
#include "params.h"
|
||||
#include "rq.h"
|
||||
#include "hide.h"
|
||||
|
||||
#if crypto_kem_CIPHERTEXTBYTES != rq_encoderounded_len + 32 + 128
|
||||
#error "crypto_kem_CIPHERTEXTBYTES must match rq_encoderounded_len + 32 + 128"
|
||||
#endif
|
||||
|
||||
/* Deterministic core of encapsulation: from public key pk and 32 random
   bytes r, produce ciphertext cstr and 32-byte session key k.
   Also reused by decapsulation for the re-encryption check. */
void hide(unsigned char *cstr,unsigned char *k,const unsigned char *pk,const unsigned char *r)
{
  modq G[p];            /* public generator, expanded from the 32-byte seed in pk */
  modq A[p];            /* rounded public polynomial, decoded from pk+32 */
  unsigned char k12[64];
  unsigned char k34[64];
  small b[p];           /* ephemeral small weight-w polynomial */
  modq B[p];
  modq C[p];
  int i;

  rq_fromseed(G,pk);
  rq_decoderounded(A,pk + 32);

  /* k12 = SHA-512(r): first half seeds b, second half is hashed again
     to give k34 = confirmation (first 32 bytes) || session key (last 32). */
  crypto_hash_sha512(k12,r,32);
  small_seeded_weightw(b,k12);
  crypto_hash_sha512(k34,k12 + 32,32);

  rq_mult(B,G,b);
  rq_round3(B,B);

  rq_mult(C,A,b);
  /* Add qshift (2295) to coefficient i when bit i of r is set, then
     compress each centered coefficient down to 4 bits. */
  for (i = 0;i < 256;++i) {
    modq x = C[i];
    x = modq_sum(x,2295 * (1 & (r[i / 8] >> (i & 7))));
    x = ((x + 2156) * 114 + 16384) >> 15;
    C[i] = x; /* between 0 and 15 */
  }

  /* ciphertext layout: confirmation || rounded-encoded B || packed C */
  memcpy(cstr,k34,32); cstr += 32;
  memcpy(k,k34 + 32,32);   /* session key out */

  rq_encoderounded(cstr,B); cstr += rq_encoderounded_len;

  /* Two 4-bit values per byte, low nibble first. */
  for (i = 0;i < 128;++i)
    *cstr++ = C[2*i] + (C[2*i + 1] << 4);
}
|
@ -1,9 +0,0 @@
|
||||
#ifndef hide_h
|
||||
#define hide_h
|
||||
|
||||
#include "crypto_int32.h"
|
||||
|
||||
#define hide crypto_kem_ntrulpr4591761_ref_hide
|
||||
extern void hide(unsigned char *,unsigned char *,const unsigned char *,const unsigned char *);
|
||||
|
||||
#endif
|
@ -1,9 +0,0 @@
|
||||
#ifndef int32_sort_h
|
||||
#define int32_sort_h
|
||||
|
||||
#include "crypto_int32.h"
|
||||
|
||||
#define int32_sort crypto_kem_ntrulpr4591761_ref_int32_sort
|
||||
extern void int32_sort(crypto_int32 *,int);
|
||||
|
||||
#endif
|
@ -1,39 +0,0 @@
|
||||
#include <string.h>
|
||||
#include "modq.h"
|
||||
#include "params.h"
|
||||
#include "small.h"
|
||||
#include "rq.h"
|
||||
#include "crypto_kem.h"
|
||||
#include "randombytes.h"
|
||||
#include "crypto_stream_aes256ctr.h"
|
||||
|
||||
#if crypto_kem_PUBLICKEYBYTES != rq_encoderounded_len + 32
|
||||
#error "crypto_kem_PUBLICKEYBYTES must match rq_encoderounded_len + 32"
|
||||
#endif
|
||||
#if crypto_kem_SECRETKEYBYTES != small_encode_len + crypto_kem_PUBLICKEYBYTES
|
||||
#error "crypto_kem_SECRETKEYBYTES must match small_encode_len + crypto_kem_PUBLICKEYBYTES"
|
||||
#endif
|
||||
|
||||
int crypto_kem_keypair(unsigned char *pk,unsigned char *sk)
|
||||
{
|
||||
unsigned char K[32];
|
||||
modq G[p];
|
||||
small a[p];
|
||||
modq A[p];
|
||||
|
||||
randombytes(K,32);
|
||||
rq_fromseed(G,K);
|
||||
|
||||
small_random_weightw(a);
|
||||
|
||||
rq_mult(A,G,a);
|
||||
rq_round3(A,A);
|
||||
|
||||
memcpy(pk,K,32);
|
||||
rq_encoderounded(pk + 32,A);
|
||||
|
||||
small_encode(sk,a);
|
||||
memcpy(sk + small_encode_len,pk,crypto_kem_PUBLICKEYBYTES);
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,44 +0,0 @@
|
||||
#ifndef modq_h
#define modq_h

#include "crypto_int16.h"
#include "crypto_int32.h"
#include "crypto_uint16.h"
#include "crypto_uint32.h"

/* One ring coefficient mod q = 4591, kept centered in [-2295,2295]. */
typedef crypto_int16 modq;

/* input between -9000000 and 9000000 */
/* output between -2295 and 2295 */
static inline modq modq_freeze(crypto_int32 a)
{
  /* Two rounds of approximate division by 4591 via multiply-and-shift
     (228/2^20 and 58470/2^28 both approximate 1/4591; 134217728 = 2^27
     rounds the second quotient) — branch-free reduction. */
  a -= 4591 * ((228 * a) >> 20);
  a -= 4591 * ((58470 * a + 134217728) >> 28);
  return a;
}

/* input between 0 and 4294967295 */
/* output = (input % 4591) - 2295 */
static inline modq modq_fromuint32(crypto_uint32 a)
{
  crypto_int32 r;
  /* Fold the top 13 bits: 2^19 mod 4591 = 914, so
     a = (a>>19)*2^19 + (a & (2^19-1)) reduces to the sum below. */
  r = (a & 524287) + (a >> 19) * 914; /* <= 8010861 */
  return modq_freeze(r - 2295);
}

/* Returns the centered representative of a + b*c mod q. */
static inline modq modq_plusproduct(modq a,modq b,modq c)
{
  crypto_int32 A = a;
  crypto_int32 B = b;
  crypto_int32 C = c;
  return modq_freeze(A + B * C);
}

/* Returns the centered representative of a + b mod q. */
static inline modq modq_sum(modq a,modq b)
{
  crypto_int32 A = a;
  crypto_int32 B = b;
  return modq_freeze(A + B);
}

#endif
|
@ -1,15 +0,0 @@
|
||||
#ifndef params_h
|
||||
#define params_h
|
||||
|
||||
#define q 4591
|
||||
/* XXX: also built into modq in various ways */
|
||||
|
||||
#define qshift 2295
|
||||
#define p 761
|
||||
#define w 250
|
||||
|
||||
#define rq_encode_len 1218
|
||||
#define rq_encoderounded_len 1015
|
||||
#define small_encode_len 191
|
||||
|
||||
#endif
|
@ -1,28 +0,0 @@
|
||||
#include "params.h"
|
||||
#include "randombytes.h"
|
||||
#include "int32_sort.h"
|
||||
#include "small.h"
|
||||
#include "crypto_stream_aes256ctr.h"
|
||||
|
||||
static const unsigned char n[16] = {0};
|
||||
|
||||
/* Deterministically expand the 32-byte seed k into a random polynomial f
   with exactly w coefficients in {-1,+1} and the remaining p-w equal to 0. */
void small_seeded_weightw(small *f,const unsigned char *k)
{
  crypto_int32 r[p];
  int i;

  /* Fill r with AES-256-CTR keystream under seed k (all-zero nonce n). */
  crypto_stream_aes256ctr((unsigned char *) r,sizeof r,n,k);
  /* Flip the top bit of each word; distribution-preserving for uniform
     input (NOTE(review): likely chosen so signed sort order matches the
     unsigned keystream order — confirm). */
  for (i = 0;i < p;++i) r[i] ^= 0x80000000;

  /* Tag low bits: first w entries get bit0=0 (coefficient will be +-1,
     chosen by random bit1); remaining entries get bits 01 (coefficient 0). */
  for (i = 0;i < w;++i) r[i] &= -2;
  for (i = w;i < p;++i) r[i] = (r[i] & -3) | 1;
  /* Sorting by the random high bits permutes the tags into random positions. */
  int32_sort(r,p);
  /* Decode low two bits: 00 -> -1, 01 -> 0, 10 -> +1. */
  for (i = 0;i < p;++i) f[i] = ((small) (r[i] & 3)) - 1;
}
|
||||
|
||||
/* Sample a fresh random weight-w small polynomial into f. */
void small_random_weightw(small *f)
{
  unsigned char seed[32];

  /* Draw a one-shot 32-byte seed, then expand it deterministically. */
  randombytes(seed,32);
  small_seeded_weightw(f,seed);
}
|
@ -1,31 +0,0 @@
|
||||
#ifndef rq_h
|
||||
#define rq_h
|
||||
|
||||
#include "modq.h"
|
||||
#include "small.h"
|
||||
|
||||
#define rq_encode crypto_kem_ntrulpr4591761_ref_rq_encode
|
||||
extern void rq_encode(unsigned char *,const modq *);
|
||||
|
||||
#define rq_decode crypto_kem_ntrulpr4591761_ref_rq_decode
|
||||
extern void rq_decode(modq *,const unsigned char *);
|
||||
|
||||
#define rq_encoderounded crypto_kem_ntrulpr4591761_ref_rq_encoderounded
|
||||
extern void rq_encoderounded(unsigned char *,const modq *);
|
||||
|
||||
#define rq_decoderounded crypto_kem_ntrulpr4591761_ref_rq_decoderounded
|
||||
extern void rq_decoderounded(modq *,const unsigned char *);
|
||||
|
||||
#define rq_round3 crypto_kem_ntrulpr4591761_ref_rq_round
|
||||
extern void rq_round3(modq *,const modq *);
|
||||
|
||||
#define rq_mult crypto_kem_ntrulpr4591761_ref_rq_mult
|
||||
extern void rq_mult(modq *,const modq *,const small *);
|
||||
|
||||
#define rq_recip3 crypto_kem_ntrulpr4591761_ref_rq_recip3
|
||||
int rq_recip3(modq *,const small *);
|
||||
|
||||
#define rq_fromseed crypto_kem_ntrulpr4591761_ref_rq_fromseed
|
||||
extern void rq_fromseed(modq *,const unsigned char *);
|
||||
|
||||
#endif
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue