make format crypto/

pull/787/head
Rick V 5 years ago
parent 842fe039bc
commit 595b15d538

@@ -2,30 +2,30 @@
#ifndef blake2b_compress_avx2_H
#define blake2b_compress_avx2_H
#define LOADU128(p) _mm_loadu_si128((__m128i *) (p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), r)
#define LOADU128(p) _mm_loadu_si128((__m128i *)(p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r)
#define LOAD(p) _mm256_load_si256((__m256i *) (p))
#define STORE(p, r) _mm256_store_si256((__m256i *) (p), r)
#define LOAD(p) _mm256_load_si256((__m256i *)(p))
#define STORE(p, r) _mm256_store_si256((__m256i *)(p), r)
#define LOADU(p) _mm256_loadu_si256((__m256i *) (p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), r)
#define LOADU(p) _mm256_loadu_si256((__m256i *)(p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r)
static inline uint64_t
LOADU64(const void *p)
{
uint64_t v;
memcpy(&v, p, sizeof v);
return v;
uint64_t v;
memcpy(&v, p, sizeof v);
return v;
}
#define ROTATE16 \
_mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \
3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
#define ROTATE16 \
_mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, \
4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
#define ROTATE24 \
_mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \
4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
#define ROTATE24 \
_mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, \
5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
#define ADD(a, b) _mm256_add_epi64(a, b)
#define SUB(a, b) _mm256_sub_epi64(a, b)
@@ -40,98 +40,104 @@ LOADU64(const void *p)
#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
#define BLAKE2B_G1_V1(a, b, c, d, m) \
do { \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT32(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT24(b); \
} while (0)
do \
{ \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT32(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT24(b); \
} while(0)
#define BLAKE2B_G2_V1(a, b, c, d, m) \
do { \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT16(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT63(b); \
} while (0)
#define BLAKE2B_DIAG_V1(a, b, c, d) \
do { \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
} while (0)
#define BLAKE2B_UNDIAG_V1(a, b, c, d) \
do { \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
} while (0)
do \
{ \
a = ADD(a, m); \
a = ADD(a, b); \
d = XOR(d, a); \
d = ROT16(d); \
c = ADD(c, d); \
b = XOR(b, c); \
b = ROT63(b); \
} while(0)
#define BLAKE2B_DIAG_V1(a, b, c, d) \
do \
{ \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
} while(0)
#define BLAKE2B_UNDIAG_V1(a, b, c, d) \
do \
{ \
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
} while(0)
#include "blake2b-load-avx2.h"
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) \
do { \
__m256i b0; \
BLAKE2B_LOAD_MSG_##r##_1(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_2(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_DIAG_V1(a, b, c, d); \
BLAKE2B_LOAD_MSG_##r##_3(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_4(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_UNDIAG_V1(a, b, c, d); \
} while (0)
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) \
do { \
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
} while (0)
#define DECLARE_MESSAGE_WORDS(m) \
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
__m256i t0, t1;
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) \
do { \
DECLARE_MESSAGE_WORDS(m) \
const __m256i iv0 = a; \
const __m256i iv1 = b; \
__m256i c = LOAD(&blake2b_IV[0]); \
__m256i d = \
XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
a = XOR(a, c); \
b = XOR(b, d); \
a = XOR(a, iv0); \
b = XOR(b, iv1); \
} while (0)
do \
{ \
__m256i b0; \
BLAKE2B_LOAD_MSG_##r##_1(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_2(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_DIAG_V1(a, b, c, d); \
BLAKE2B_LOAD_MSG_##r##_3(b0); \
BLAKE2B_G1_V1(a, b, c, d, b0); \
BLAKE2B_LOAD_MSG_##r##_4(b0); \
BLAKE2B_G2_V1(a, b, c, d, b0); \
BLAKE2B_UNDIAG_V1(a, b, c, d); \
} while(0)
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) \
do \
{ \
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
} while(0)
#define DECLARE_MESSAGE_WORDS(m) \
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
__m256i t0, t1;
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) \
do \
{ \
DECLARE_MESSAGE_WORDS(m) \
const __m256i iv0 = a; \
const __m256i iv1 = b; \
__m256i c = LOAD(&blake2b_IV[0]); \
__m256i d = XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
a = XOR(a, c); \
b = XOR(b, d); \
a = XOR(a, iv0); \
b = XOR(b, iv1); \
} while(0)
#endif
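For reference while reading the reformatted macros above: BLAKE2B_G1_V1 and BLAKE2B_G2_V1 are the two halves of the BLAKE2b G function evaluated on four 64-bit lanes at once (the rotations are done with the ROTATE16/ROTATE24 shuffle constants and the ROT63 shift/add pair), and BLAKE2B_DIAG_V1/BLAKE2B_UNDIAG_V1 rotate the b, c and d rows so the same macros serve the column and diagonal steps. A minimal scalar sketch of the G function they vectorize (a reference illustration, not code from this changeset):

#include <stdint.h>

/* Right rotation of a 64-bit word. */
static inline uint64_t
rotr64(uint64_t x, unsigned c)
{
    return (x >> c) | (x << (64 - c));
}

/* Scalar BLAKE2b G: mixes two message words x and y into one column or
 * diagonal (a, b, c, d) of the 4x4 state. */
static inline void
blake2b_g(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
          uint64_t x, uint64_t y)
{
    *a = *a + *b + x;          /* first half: what BLAKE2B_G1_V1 does per lane */
    *d = rotr64(*d ^ *a, 32);
    *c = *c + *d;
    *b = rotr64(*b ^ *c, 24);
    *a = *a + *b + y;          /* second half: what BLAKE2B_G2_V1 does per lane */
    *d = rotr64(*d ^ *a, 16);
    *c = *c + *d;
    *b = rotr64(*b ^ *c, 63);
}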

@@ -2,102 +2,99 @@
#ifndef blake2b_compress_sse41_H
#define blake2b_compress_sse41_H
#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)
#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#include "blake2b-load-sse41.h"
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#endif
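The DIAGONALIZE/UNDIAGONALIZE macros in this header (like BLAKE2B_DIAG_V1/BLAKE2B_UNDIAG_V1 in the AVX2 header above) rotate the second, third and fourth rows of the 4x4 state so the column-oriented G macros act on the diagonals. A scalar sketch of that permutation, assuming the v[0..15] state layout of the reference implementation:

#include <stdint.h>

/* Rotate row 1 left by one lane, row 2 by two, row 3 by three, so the
 * diagonals (v0,v5,v10,v15), (v1,v6,v11,v12), ... line up as columns.
 * The SSE macros do the same with _mm_alignr_epi8 on the two 128-bit
 * halves of each row; UNDIAGONALIZE applies the inverse rotation. */
static void
blake2b_diagonalize(uint64_t v[16])
{
    uint64_t t;

    t = v[4];  v[4]  = v[5];  v[5]  = v[6];  v[6]  = v[7];  v[7]  = t;   /* row 1 */
    t = v[8];  v[8]  = v[10]; v[10] = t;                                 /* row 2 */
    t = v[9];  v[9]  = v[11]; v[11] = t;
    t = v[15]; v[15] = v[14]; v[14] = v[13]; v[13] = v[12]; v[12] = t;   /* row 3 */
}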

@@ -2,102 +2,99 @@
#ifndef blake2b_compress_ssse3_H
#define blake2b_compress_ssse3_H
#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)
#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#define STOREU(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
#define _mm_roti_epi64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
row2l = _mm_roti_epi64(row2l, -24); \
row2h = _mm_roti_epi64(row2h, -24);
#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
row2l = _mm_roti_epi64(row2l, -63); \
row2h = _mm_roti_epi64(row2h, -63);
#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
#include "blake2b-load-sse2.h"
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#define ROUND(r) \
LOAD_MSG_##r##_1(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_2(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
LOAD_MSG_##r##_3(b0, b1); \
G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
LOAD_MSG_##r##_4(b0, b1); \
G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
#endif
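Both the SSE4.1 header above and this SSSE3 header define _mm_roti_epi64 as a chain of conditional expressions that picks the cheapest implementation for each rotation constant BLAKE2b uses (-32, -24, -16, -63) and falls back to a generic shift pair otherwise. A small self-contained check of the scalar identities that choice relies on (an illustration only, not from the diff):

#include <assert.h>
#include <stdint.h>

static inline uint64_t
rotr64(uint64_t x, unsigned c)
{
    return (x >> c) | (x << (64 - c));
}

int
main(void)
{
    uint64_t x = 0x0123456789abcdefULL;

    /* Rotation by 32 only swaps the 32-bit halves, so a dword shuffle
     * (_mm_shuffle_epi32) suffices; rotations by 24 and 16 move whole
     * bytes, so a byte shuffle with the r24/r16 constants suffices. */
    assert(rotr64(x, 32) == ((x >> 32) | (x << 32)));

    /* Rotation by 63: the left shift by 1 is written as x + x, which is
     * what the macro's (-(c) == 63) arm computes with srli + add. */
    assert(rotr64(x, 63) == ((x >> 63) | (x + x)));

    return 0;
}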

@@ -1,340 +1,388 @@
#ifndef blake2b_load_avx2_H
#define blake2b_load_avx2_H
#define BLAKE2B_LOAD_MSG_0_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_3(b0) \
do { \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_1_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_1(b0) \
do { \
t0 = _mm256_alignr_epi8(m6, m5, 8); \
t1 = _mm256_unpackhi_epi64(m2, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m0); \
t1 = _mm256_blend_epi32(m6, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_3(b0) \
do { \
t0 = _mm256_blend_epi32(m1, m5, 0x33); \
t1 = _mm256_unpackhi_epi64(m3, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_2_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m3); \
t1 = _mm256_alignr_epi8(m2, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_unpackhi_epi64(m6, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m0); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_3(b0) \
do { \
t0 = _mm256_blend_epi32(m2, m1, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_3_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m3, m5); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m2); \
t1 = _mm256_unpacklo_epi64(m1, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_2(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m0, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_3(b0) \
do { \
t0 = _mm256_blend_epi32(m5, m7, 0x33); \
t1 = _mm256_blend_epi32(m1, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_4_4(b0) \
do { \
t0 = _mm256_alignr_epi8(m6, m0, 8); \
t1 = _mm256_blend_epi32(m6, m4, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m1, m3); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m5); \
t1 = _mm256_unpackhi_epi64(m5, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_3(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m2, 0x33); \
t1 = _mm256_unpackhi_epi64(m7, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_5_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m6, m2); \
t1 = _mm256_blend_epi32(m4, m7, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_1(b0) \
do { \
t0 = _mm256_blend_epi32(m0, m6, 0x33); \
t1 = _mm256_unpacklo_epi64(m7, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_alignr_epi8(m5, m6, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m3); \
t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_6_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_blend_epi32(m5, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_1(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m6, m3); \
t1 = _mm256_blend_epi32(m1, m6, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_2(b0) \
do { \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpackhi_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_3(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_unpacklo_epi64(m4, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_7_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m2); \
t1 = _mm256_unpacklo_epi64(m3, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m3, m7); \
t1 = _mm256_alignr_epi8(m0, m5, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_alignr_epi8(m4, m1, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_3(b0) \
do { \
t0 = m6; \
t1 = _mm256_alignr_epi8(m5, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_8_4(b0) \
do { \
t0 = _mm256_blend_epi32(m3, m1, 0x33); \
t1 = m2; \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_unpackhi_epi64(m3, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m1, m2); \
t1 = _mm256_blend_epi32(m2, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_3(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_unpackhi_epi64(m1, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_9_4(b0) \
do { \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpacklo_epi64(m6, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_2(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_3(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_10_4(b0) \
do { \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_1(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_2(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_3(b0) \
do { \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_11_4(b0) \
do { \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while (0)
#define BLAKE2B_LOAD_MSG_0_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_0_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_3(b0) \
do \
{ \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_1_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_1(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m6, m5, 8); \
t1 = _mm256_unpackhi_epi64(m2, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m0); \
t1 = _mm256_blend_epi32(m6, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m1, m5, 0x33); \
t1 = _mm256_unpackhi_epi64(m3, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_2_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m3); \
t1 = _mm256_alignr_epi8(m2, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_unpackhi_epi64(m6, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m0); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m2, m1, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_3_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m3, m5); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m2); \
t1 = _mm256_unpacklo_epi64(m1, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_2(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m0, 0x33); \
t1 = _mm256_blend_epi32(m7, m2, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m5, m7, 0x33); \
t1 = _mm256_blend_epi32(m1, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_4_4(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m6, m0, 8); \
t1 = _mm256_blend_epi32(m6, m4, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m1, m3); \
t1 = _mm256_unpacklo_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m5); \
t1 = _mm256_unpackhi_epi64(m5, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_3(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m2, 0x33); \
t1 = _mm256_unpackhi_epi64(m7, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_5_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m6, m2); \
t1 = _mm256_blend_epi32(m4, m7, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_1(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m0, m6, 0x33); \
t1 = _mm256_unpacklo_epi64(m7, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_alignr_epi8(m5, m6, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m3); \
t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_6_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m3, m1); \
t1 = _mm256_blend_epi32(m5, m1, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_1(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m6, m3); \
t1 = _mm256_blend_epi32(m1, m6, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_2(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpackhi_epi64(m0, m4); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_3(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m2, m7); \
t1 = _mm256_unpacklo_epi64(m4, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_7_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m2); \
t1 = _mm256_unpacklo_epi64(m3, m5); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m3, m7); \
t1 = _mm256_alignr_epi8(m0, m5, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_alignr_epi8(m4, m1, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_3(b0) \
do \
{ \
t0 = m6; \
t1 = _mm256_alignr_epi8(m5, m0, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_8_4(b0) \
do \
{ \
t0 = _mm256_blend_epi32(m3, m1, 0x33); \
t1 = m2; \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_unpackhi_epi64(m3, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m1, m2); \
t1 = _mm256_blend_epi32(m2, m3, 0x33); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_3(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m7, m4); \
t1 = _mm256_unpackhi_epi64(m1, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_9_4(b0) \
do \
{ \
t0 = _mm256_alignr_epi8(m7, m5, 8); \
t1 = _mm256_unpacklo_epi64(m6, m0); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m0, m1); \
t1 = _mm256_unpacklo_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_2(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m0, m1); \
t1 = _mm256_unpackhi_epi64(m2, m3); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_3(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m4, m5); \
t1 = _mm256_unpacklo_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_10_4(b0) \
do \
{ \
t0 = _mm256_unpackhi_epi64(m4, m5); \
t1 = _mm256_unpackhi_epi64(m6, m7); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_1(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m7, m2); \
t1 = _mm256_unpackhi_epi64(m4, m6); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_2(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m5, m4); \
t1 = _mm256_alignr_epi8(m3, m7, 8); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_3(b0) \
do \
{ \
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
t1 = _mm256_unpackhi_epi64(m5, m2); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#define BLAKE2B_LOAD_MSG_11_4(b0) \
do \
{ \
t0 = _mm256_unpacklo_epi64(m6, m1); \
t1 = _mm256_unpackhi_epi64(m3, m1); \
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
} while(0)
#endif

@@ -16,149 +16,149 @@
#ifndef blake2b_load_sse2_H
#define blake2b_load_sse2_H
#define LOAD_MSG_0_1(b0, b1) \
b0 = _mm_set_epi64x(m2, m0); \
b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_0_2(b0, b1) \
b0 = _mm_set_epi64x(m3, m1); \
b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_0_3(b0, b1) \
b0 = _mm_set_epi64x(m10, m8); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_0_4(b0, b1) \
b0 = _mm_set_epi64x(m11, m9); \
b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_1_1(b0, b1) \
b0 = _mm_set_epi64x(m4, m14); \
b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_1_2(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_1_3(b0, b1) \
b0 = _mm_set_epi64x(m0, m1); \
b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_1_4(b0, b1) \
b0 = _mm_set_epi64x(m2, m12); \
b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_2_1(b0, b1) \
b0 = _mm_set_epi64x(m12, m11); \
b1 = _mm_set_epi64x(m15, m5)
#define LOAD_MSG_2_2(b0, b1) \
b0 = _mm_set_epi64x(m0, m8); \
b1 = _mm_set_epi64x(m13, m2)
#define LOAD_MSG_2_3(b0, b1) \
b0 = _mm_set_epi64x(m3, m10); \
b1 = _mm_set_epi64x(m9, m7)
#define LOAD_MSG_2_4(b0, b1) \
b0 = _mm_set_epi64x(m6, m14); \
b1 = _mm_set_epi64x(m4, m1)
#define LOAD_MSG_3_1(b0, b1) \
b0 = _mm_set_epi64x(m3, m7); \
b1 = _mm_set_epi64x(m11, m13)
#define LOAD_MSG_3_2(b0, b1) \
b0 = _mm_set_epi64x(m1, m9); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_3_3(b0, b1) \
b0 = _mm_set_epi64x(m5, m2); \
b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_3_4(b0, b1) \
b0 = _mm_set_epi64x(m10, m6); \
b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_4_1(b0, b1) \
b0 = _mm_set_epi64x(m5, m9); \
b1 = _mm_set_epi64x(m10, m2)
#define LOAD_MSG_4_2(b0, b1) \
b0 = _mm_set_epi64x(m7, m0); \
b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_4_3(b0, b1) \
b0 = _mm_set_epi64x(m11, m14); \
b1 = _mm_set_epi64x(m3, m6)
#define LOAD_MSG_4_4(b0, b1) \
b0 = _mm_set_epi64x(m12, m1); \
b1 = _mm_set_epi64x(m13, m8)
#define LOAD_MSG_5_1(b0, b1) \
b0 = _mm_set_epi64x(m6, m2); \
b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_5_2(b0, b1) \
b0 = _mm_set_epi64x(m10, m12); \
b1 = _mm_set_epi64x(m3, m11)
#define LOAD_MSG_5_3(b0, b1) \
b0 = _mm_set_epi64x(m7, m4); \
b1 = _mm_set_epi64x(m1, m15)
#define LOAD_MSG_5_4(b0, b1) \
b0 = _mm_set_epi64x(m5, m13); \
b1 = _mm_set_epi64x(m9, m14)
#define LOAD_MSG_6_1(b0, b1) \
b0 = _mm_set_epi64x(m1, m12); \
b1 = _mm_set_epi64x(m4, m14)
#define LOAD_MSG_6_2(b0, b1) \
b0 = _mm_set_epi64x(m15, m5); \
b1 = _mm_set_epi64x(m10, m13)
#define LOAD_MSG_6_3(b0, b1) \
b0 = _mm_set_epi64x(m6, m0); \
b1 = _mm_set_epi64x(m8, m9)
#define LOAD_MSG_6_4(b0, b1) \
b0 = _mm_set_epi64x(m3, m7); \
b1 = _mm_set_epi64x(m11, m2)
#define LOAD_MSG_7_1(b0, b1) \
b0 = _mm_set_epi64x(m7, m13); \
b1 = _mm_set_epi64x(m3, m12)
#define LOAD_MSG_7_2(b0, b1) \
b0 = _mm_set_epi64x(m14, m11); \
b1 = _mm_set_epi64x(m9, m1)
#define LOAD_MSG_7_3(b0, b1) \
b0 = _mm_set_epi64x(m15, m5); \
b1 = _mm_set_epi64x(m2, m8)
#define LOAD_MSG_7_4(b0, b1) \
b0 = _mm_set_epi64x(m4, m0); \
b1 = _mm_set_epi64x(m10, m6)
#define LOAD_MSG_8_1(b0, b1) \
b0 = _mm_set_epi64x(m14, m6); \
b1 = _mm_set_epi64x(m0, m11)
#define LOAD_MSG_8_2(b0, b1) \
b0 = _mm_set_epi64x(m9, m15); \
b1 = _mm_set_epi64x(m8, m3)
#define LOAD_MSG_8_3(b0, b1) \
b0 = _mm_set_epi64x(m13, m12); \
b1 = _mm_set_epi64x(m10, m1)
#define LOAD_MSG_8_4(b0, b1) \
b0 = _mm_set_epi64x(m7, m2); \
b1 = _mm_set_epi64x(m5, m4)
#define LOAD_MSG_9_1(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m1, m7)
#define LOAD_MSG_9_2(b0, b1) \
b0 = _mm_set_epi64x(m4, m2); \
b1 = _mm_set_epi64x(m5, m6)
#define LOAD_MSG_9_3(b0, b1) \
b0 = _mm_set_epi64x(m9, m15); \
b1 = _mm_set_epi64x(m13, m3)
#define LOAD_MSG_9_4(b0, b1) \
b0 = _mm_set_epi64x(m14, m11); \
b1 = _mm_set_epi64x(m0, m12)
#define LOAD_MSG_10_1(b0, b1) \
b0 = _mm_set_epi64x(m2, m0); \
b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_10_2(b0, b1) \
b0 = _mm_set_epi64x(m3, m1); \
b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_10_3(b0, b1) \
b0 = _mm_set_epi64x(m10, m8); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_10_4(b0, b1) \
b0 = _mm_set_epi64x(m11, m9); \
b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_11_1(b0, b1) \
b0 = _mm_set_epi64x(m4, m14); \
b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_11_2(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_11_3(b0, b1) \
b0 = _mm_set_epi64x(m0, m1); \
b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_11_4(b0, b1) \
b0 = _mm_set_epi64x(m2, m12); \
b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_0_1(b0, b1) \
b0 = _mm_set_epi64x(m2, m0); \
b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_0_2(b0, b1) \
b0 = _mm_set_epi64x(m3, m1); \
b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_0_3(b0, b1) \
b0 = _mm_set_epi64x(m10, m8); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_0_4(b0, b1) \
b0 = _mm_set_epi64x(m11, m9); \
b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_1_1(b0, b1) \
b0 = _mm_set_epi64x(m4, m14); \
b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_1_2(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_1_3(b0, b1) \
b0 = _mm_set_epi64x(m0, m1); \
b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_1_4(b0, b1) \
b0 = _mm_set_epi64x(m2, m12); \
b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_2_1(b0, b1) \
b0 = _mm_set_epi64x(m12, m11); \
b1 = _mm_set_epi64x(m15, m5)
#define LOAD_MSG_2_2(b0, b1) \
b0 = _mm_set_epi64x(m0, m8); \
b1 = _mm_set_epi64x(m13, m2)
#define LOAD_MSG_2_3(b0, b1) \
b0 = _mm_set_epi64x(m3, m10); \
b1 = _mm_set_epi64x(m9, m7)
#define LOAD_MSG_2_4(b0, b1) \
b0 = _mm_set_epi64x(m6, m14); \
b1 = _mm_set_epi64x(m4, m1)
#define LOAD_MSG_3_1(b0, b1) \
b0 = _mm_set_epi64x(m3, m7); \
b1 = _mm_set_epi64x(m11, m13)
#define LOAD_MSG_3_2(b0, b1) \
b0 = _mm_set_epi64x(m1, m9); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_3_3(b0, b1) \
b0 = _mm_set_epi64x(m5, m2); \
b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_3_4(b0, b1) \
b0 = _mm_set_epi64x(m10, m6); \
b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_4_1(b0, b1) \
b0 = _mm_set_epi64x(m5, m9); \
b1 = _mm_set_epi64x(m10, m2)
#define LOAD_MSG_4_2(b0, b1) \
b0 = _mm_set_epi64x(m7, m0); \
b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_4_3(b0, b1) \
b0 = _mm_set_epi64x(m11, m14); \
b1 = _mm_set_epi64x(m3, m6)
#define LOAD_MSG_4_4(b0, b1) \
b0 = _mm_set_epi64x(m12, m1); \
b1 = _mm_set_epi64x(m13, m8)
#define LOAD_MSG_5_1(b0, b1) \
b0 = _mm_set_epi64x(m6, m2); \
b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_5_2(b0, b1) \
b0 = _mm_set_epi64x(m10, m12); \
b1 = _mm_set_epi64x(m3, m11)
#define LOAD_MSG_5_3(b0, b1) \
b0 = _mm_set_epi64x(m7, m4); \
b1 = _mm_set_epi64x(m1, m15)
#define LOAD_MSG_5_4(b0, b1) \
b0 = _mm_set_epi64x(m5, m13); \
b1 = _mm_set_epi64x(m9, m14)
#define LOAD_MSG_6_1(b0, b1) \
b0 = _mm_set_epi64x(m1, m12); \
b1 = _mm_set_epi64x(m4, m14)
#define LOAD_MSG_6_2(b0, b1) \
b0 = _mm_set_epi64x(m15, m5); \
b1 = _mm_set_epi64x(m10, m13)
#define LOAD_MSG_6_3(b0, b1) \
b0 = _mm_set_epi64x(m6, m0); \
b1 = _mm_set_epi64x(m8, m9)
#define LOAD_MSG_6_4(b0, b1) \
b0 = _mm_set_epi64x(m3, m7); \
b1 = _mm_set_epi64x(m11, m2)
#define LOAD_MSG_7_1(b0, b1) \
b0 = _mm_set_epi64x(m7, m13); \
b1 = _mm_set_epi64x(m3, m12)
#define LOAD_MSG_7_2(b0, b1) \
b0 = _mm_set_epi64x(m14, m11); \
b1 = _mm_set_epi64x(m9, m1)
#define LOAD_MSG_7_3(b0, b1) \
b0 = _mm_set_epi64x(m15, m5); \
b1 = _mm_set_epi64x(m2, m8)
#define LOAD_MSG_7_4(b0, b1) \
b0 = _mm_set_epi64x(m4, m0); \
b1 = _mm_set_epi64x(m10, m6)
#define LOAD_MSG_8_1(b0, b1) \
b0 = _mm_set_epi64x(m14, m6); \
b1 = _mm_set_epi64x(m0, m11)
#define LOAD_MSG_8_2(b0, b1) \
b0 = _mm_set_epi64x(m9, m15); \
b1 = _mm_set_epi64x(m8, m3)
#define LOAD_MSG_8_3(b0, b1) \
b0 = _mm_set_epi64x(m13, m12); \
b1 = _mm_set_epi64x(m10, m1)
#define LOAD_MSG_8_4(b0, b1) \
b0 = _mm_set_epi64x(m7, m2); \
b1 = _mm_set_epi64x(m5, m4)
#define LOAD_MSG_9_1(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m1, m7)
#define LOAD_MSG_9_2(b0, b1) \
b0 = _mm_set_epi64x(m4, m2); \
b1 = _mm_set_epi64x(m5, m6)
#define LOAD_MSG_9_3(b0, b1) \
b0 = _mm_set_epi64x(m9, m15); \
b1 = _mm_set_epi64x(m13, m3)
#define LOAD_MSG_9_4(b0, b1) \
b0 = _mm_set_epi64x(m14, m11); \
b1 = _mm_set_epi64x(m0, m12)
#define LOAD_MSG_10_1(b0, b1) \
b0 = _mm_set_epi64x(m2, m0); \
b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_10_2(b0, b1) \
b0 = _mm_set_epi64x(m3, m1); \
b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_10_3(b0, b1) \
b0 = _mm_set_epi64x(m10, m8); \
b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_10_4(b0, b1) \
b0 = _mm_set_epi64x(m11, m9); \
b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_11_1(b0, b1) \
b0 = _mm_set_epi64x(m4, m14); \
b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_11_2(b0, b1) \
b0 = _mm_set_epi64x(m8, m10); \
b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_11_3(b0, b1) \
b0 = _mm_set_epi64x(m0, m1); \
b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_11_4(b0, b1) \
b0 = _mm_set_epi64x(m2, m12); \
b1 = _mm_set_epi64x(m3, m7)
#endif
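The LOAD_MSG_r_i macros in this header spell out BLAKE2b's sigma message schedule with one _mm_set_epi64x pair per G call; the AVX2 and SSE4.1 variants gather the same words with unpack/blend/alignr instead. BLAKE2b runs 12 rounds but sigma has only 10 rows, so rounds 10 and 11 repeat rounds 0 and 1, which is why LOAD_MSG_10_*/LOAD_MSG_11_* are textually identical to LOAD_MSG_0_*/LOAD_MSG_1_*. As a sketch, the first two schedule rows being encoded (the array name is only for illustration; the remaining rows follow the BLAKE2 specification):

#include <stdint.h>

/* Rows 0 and 1 of BLAKE2b's sigma permutation. In the scalar reference,
 * G call i of round r consumes m[sigma[r][2*i]] and m[sigma[r][2*i+1]];
 * LOAD_MSG_r_1 .. LOAD_MSG_r_4 pre-gather those words two per register. */
static const uint8_t blake2b_sigma_first_rows[2][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
};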

@@ -16,292 +16,340 @@
#ifndef blake2b_load_sse41_H
#define blake2b_load_sse41_H
#define LOAD_MSG_0_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_0_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_0_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_0_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_1_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while (0)
#define LOAD_MSG_1_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while (0)
#define LOAD_MSG_1_3(b0, b1) \
do { \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while (0)
#define LOAD_MSG_1_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while (0)
#define LOAD_MSG_2_1(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while (0)
#define LOAD_MSG_2_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while (0)
#define LOAD_MSG_2_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while (0)
#define LOAD_MSG_2_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while (0)
#define LOAD_MSG_3_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while (0)
#define LOAD_MSG_3_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_3_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while (0)
#define LOAD_MSG_3_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while (0)
#define LOAD_MSG_4_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while (0)
#define LOAD_MSG_4_2(b0, b1) \
do { \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while (0)
#define LOAD_MSG_4_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while (0)
#define LOAD_MSG_4_4(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while (0)
#define LOAD_MSG_5_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while (0)
#define LOAD_MSG_5_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while (0)
#define LOAD_MSG_5_3(b0, b1) \
do { \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while (0)
#define LOAD_MSG_5_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while (0)
#define LOAD_MSG_6_1(b0, b1) \
do { \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while (0)
#define LOAD_MSG_6_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while (0)
#define LOAD_MSG_6_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
} while (0)
#define LOAD_MSG_6_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while (0)
#define LOAD_MSG_7_1(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while (0)
#define LOAD_MSG_7_2(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while (0)
#define LOAD_MSG_7_3(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while (0)
#define LOAD_MSG_7_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while (0)
#define LOAD_MSG_8_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while (0)
#define LOAD_MSG_8_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while (0)
#define LOAD_MSG_8_3(b0, b1) \
do { \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while (0)
#define LOAD_MSG_8_4(b0, b1) \
do { \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while (0)
#define LOAD_MSG_9_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while (0)
#define LOAD_MSG_9_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while (0)
#define LOAD_MSG_9_3(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while (0)
#define LOAD_MSG_9_4(b0, b1) \
do { \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while (0)
#define LOAD_MSG_10_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_10_2(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while (0)
#define LOAD_MSG_10_3(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_10_4(b0, b1) \
do { \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while (0)
#define LOAD_MSG_11_1(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while (0)
#define LOAD_MSG_11_2(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while (0)
#define LOAD_MSG_11_3(b0, b1) \
do { \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while (0)
#define LOAD_MSG_11_4(b0, b1) \
do { \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while (0)
#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while(0)
#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while(0)
#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while(0)
#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while(0)
#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while(0)
#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while(0)
#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while(0)
#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while(0)
#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while(0)
#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while(0)
#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while(0)
#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while(0)
#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while(0)
#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
} while(0)
#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while(0)
#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while(0)
#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while(0)
#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while(0)
#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while(0)
#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while(0)
#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while(0)
#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while(0)
#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while(0)
#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while(0)
#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while(0)
#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while(0)
#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#endif

@@ -24,15 +24,17 @@
#ifndef __amd64__
#ifdef __clang__
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
#else
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __target__("sse2")))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __target__("sse2")))
#endif
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a)
{
return (__m128i){ __a, 0 };
return (__m128i){__a, 0};
}
#endif
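The last hunk below reformats the vectorized ChaCha20 tail: it expands the remaining state into one keystream block (partialblock[64]) and XORs it onto the message bytes. Each pass of its ROUNDS loop runs a column round and a diagonal round on whole 128-bit rows, with the 16- and 8-bit rotations done by byte shuffles (rot16/rot8), the 12- and 7-bit ones by slli/srli pairs, and the 0x93/0x39/0x4e lane shuffles realigning the rows between the two halves of a double round. The scalar quarter round those rows implement is, as a reference sketch:

#include <stdint.h>

static inline uint32_t
rotl32(uint32_t x, unsigned c)
{
    return (x << c) | (x >> (32 - c));
}

/* One ChaCha20 quarter round, applied per 32-bit lane by the vector code. */
static inline void
chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}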

@@ -1,86 +1,89 @@
if (bytes > 0) {
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint8_t partialblock[64];
unsigned int i;
x_0 = _mm_loadu_si128((__m128i*) (x + 0));
x_1 = _mm_loadu_si128((__m128i*) (x + 4));
x_2 = _mm_loadu_si128((__m128i*) (x + 8));
x_3 = _mm_loadu_si128((__m128i*) (x + 12));
for (i = 0; i < ROUNDS; i += 2) {
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
_mm_storeu_si128((__m128i*) (partialblock + 0), x_0);
_mm_storeu_si128((__m128i*) (partialblock + 16), x_1);
_mm_storeu_si128((__m128i*) (partialblock + 32), x_2);
_mm_storeu_si128((__m128i*) (partialblock + 48), x_3);
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
}
sodium_memzero(partialblock, sizeof partialblock);
if(bytes > 0)
{
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint8_t partialblock[64];
unsigned int i;
x_0 = _mm_loadu_si128((__m128i*)(x + 0));
x_1 = _mm_loadu_si128((__m128i*)(x + 4));
x_2 = _mm_loadu_si128((__m128i*)(x + 8));
x_3 = _mm_loadu_si128((__m128i*)(x + 12));
for(i = 0; i < ROUNDS; i += 2)
{
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
_mm_storeu_si128((__m128i*)(partialblock + 0), x_0);
_mm_storeu_si128((__m128i*)(partialblock + 16), x_1);
_mm_storeu_si128((__m128i*)(partialblock + 32), x_2);
_mm_storeu_si128((__m128i*)(partialblock + 48), x_3);
for(i = 0; i < bytes; i++)
{
c[i] = m[i] ^ partialblock[i];
}
sodium_memzero(partialblock, sizeof partialblock);
}
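
The two hunks above are the ChaCha20 tail handler before and after reformatting: each loop iteration runs one column round and one diagonal round, with the 16- and 8-bit rotations done by byte shuffles and the diagonalisation done by _mm_shuffle_epi32 lane rotations. For reference, the scalar quarter-round these intrinsics vectorise (standard ChaCha definition, not code from this tree):

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* Standard ChaCha20 quarter-round on four 32-bit state words. */
static void
chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}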

@ -1,98 +1,101 @@
while (bytes >= 64) {
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12;
uint32_t in13;
int i;
x_0 = _mm_loadu_si128((__m128i*) (x + 0));
x_1 = _mm_loadu_si128((__m128i*) (x + 4));
x_2 = _mm_loadu_si128((__m128i*) (x + 8));
x_3 = _mm_loadu_si128((__m128i*) (x + 12));
for (i = 0; i < ROUNDS; i += 2) {
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*) (m + 0)));
x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*) (m + 16)));
x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*) (m + 32)));
x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*) (m + 48)));
_mm_storeu_si128((__m128i*) (c + 0), x_0);
_mm_storeu_si128((__m128i*) (c + 16), x_1);
_mm_storeu_si128((__m128i*) (c + 32), x_2);
_mm_storeu_si128((__m128i*) (c + 48), x_3);
in12 = x[12];
in13 = x[13];
in12++;
if (in12 == 0) {
in13++;
}
x[12] = in12;
x[13] = in13;
bytes -= 64;
c += 64;
m += 64;
while(bytes >= 64)
{
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
const __m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12;
uint32_t in13;
int i;
x_0 = _mm_loadu_si128((__m128i*)(x + 0));
x_1 = _mm_loadu_si128((__m128i*)(x + 4));
x_2 = _mm_loadu_si128((__m128i*)(x + 8));
x_3 = _mm_loadu_si128((__m128i*)(x + 12));
for(i = 0; i < ROUNDS; i += 2)
{
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*)(m + 0)));
x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*)(m + 16)));
x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*)(m + 32)));
x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*)(m + 48)));
_mm_storeu_si128((__m128i*)(c + 0), x_0);
_mm_storeu_si128((__m128i*)(c + 16), x_1);
_mm_storeu_si128((__m128i*)(c + 32), x_2);
_mm_storeu_si128((__m128i*)(c + 48), x_3);
in12 = x[12];
in13 = x[13];
in12++;
if(in12 == 0)
{
in13++;
}
x[12] = in12;
x[13] = in13;
bytes -= 64;
c += 64;
m += 64;
}
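
The full-block loop above keeps the 64-bit block counter split across state words x[12] (low half) and x[13] (high half) and propagates the carry by hand after each block. A scalar sketch of that increment, assuming the same uint32_t x[16] state layout as the surrounding code:

#include <stdint.h>

/* Increment the 64-bit ChaCha block counter stored in two 32-bit state
 * words, mirroring the in12/in13 handling above. */
static void
chacha_increment_counter(uint32_t x[16])
{
    x[12]++;
    if (x[12] == 0) { /* low word wrapped: carry into the high word */
        x[13]++;
    }
}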

@ -1,174 +1,177 @@
#define VEC4_ROT(A, IMM) \
_mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
_mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
* 16) (better) */
#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot16); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 12); \
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot8); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 7)
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot16); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 12); \
x_##A = _mm_add_epi32(x_##A, x_##B); \
t_##A = _mm_xor_si128(x_##D, x_##A); \
x_##D = _mm_shuffle_epi8(t_##A, rot8); \
x_##C = _mm_add_epi32(x_##C, x_##D); \
t_##C = _mm_xor_si128(x_##B, x_##C); \
x_##B = VEC4_ROT(t_##C, 7)
#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
if (bytes >= 256) {
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
__m128i x_0 = _mm_set1_epi32(x[0]);
__m128i x_1 = _mm_set1_epi32(x[1]);
__m128i x_2 = _mm_set1_epi32(x[2]);
__m128i x_3 = _mm_set1_epi32(x[3]);
__m128i x_4 = _mm_set1_epi32(x[4]);
__m128i x_5 = _mm_set1_epi32(x[5]);
__m128i x_6 = _mm_set1_epi32(x[6]);
__m128i x_7 = _mm_set1_epi32(x[7]);
__m128i x_8 = _mm_set1_epi32(x[8]);
__m128i x_9 = _mm_set1_epi32(x[9]);
__m128i x_10 = _mm_set1_epi32(x[10]);
__m128i x_11 = _mm_set1_epi32(x[11]);
__m128i x_12;
__m128i x_13;
__m128i x_14 = _mm_set1_epi32(x[14]);
__m128i x_15 = _mm_set1_epi32(x[15]);
__m128i orig0 = x_0;
__m128i orig1 = x_1;
__m128i orig2 = x_2;
__m128i orig3 = x_3;
__m128i orig4 = x_4;
__m128i orig5 = x_5;
__m128i orig6 = x_6;
__m128i orig7 = x_7;
__m128i orig8 = x_8;
__m128i orig9 = x_9;
__m128i orig10 = x_10;
__m128i orig11 = x_11;
__m128i orig12;
__m128i orig13;
__m128i orig14 = x_14;
__m128i orig15 = x_15;
__m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
uint32_t in12, in13;
int i;
while (bytes >= 256) {
const __m128i addv12 = _mm_set_epi64x(1, 0);
const __m128i addv13 = _mm_set_epi64x(3, 2);
__m128i t12, t13;
uint64_t in1213;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
t12 = _mm_set1_epi64x(in1213);
t13 = _mm_set1_epi64x(in1213);
x_12 = _mm_add_epi64(addv12, t12);
x_13 = _mm_add_epi64(addv13, t13);
t12 = _mm_unpacklo_epi32(x_12, x_13);
t13 = _mm_unpackhi_epi32(x_12, x_13);
x_12 = _mm_unpacklo_epi32(t12, t13);
x_13 = _mm_unpackhi_epi32(t12, t13);
orig12 = x_12;
orig13 = x_13;
in1213 += 4;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for (i = 0; i < ROUNDS; i += 2) {
VEC4_QUARTERROUND(0, 4, 8, 12);
VEC4_QUARTERROUND(1, 5, 9, 13);
VEC4_QUARTERROUND(2, 6, 10, 14);
VEC4_QUARTERROUND(3, 7, 11, 15);
VEC4_QUARTERROUND(0, 5, 10, 15);
VEC4_QUARTERROUND(1, 6, 11, 12);
VEC4_QUARTERROUND(2, 7, 8, 13);
VEC4_QUARTERROUND(3, 4, 9, 14);
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
\
x_##A = _mm_add_epi32(x_##A, orig##A); \
x_##B = _mm_add_epi32(x_##B, orig##B); \
x_##C = _mm_add_epi32(x_##C, orig##C); \
x_##D = _mm_add_epi32(x_##D, orig##D); \
t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \
\
t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
if(bytes >= 256)
{
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m128i rot16 =
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m128i rot8 =
_mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
__m128i x_0 = _mm_set1_epi32(x[0]);
__m128i x_1 = _mm_set1_epi32(x[1]);
__m128i x_2 = _mm_set1_epi32(x[2]);
__m128i x_3 = _mm_set1_epi32(x[3]);
__m128i x_4 = _mm_set1_epi32(x[4]);
__m128i x_5 = _mm_set1_epi32(x[5]);
__m128i x_6 = _mm_set1_epi32(x[6]);
__m128i x_7 = _mm_set1_epi32(x[7]);
__m128i x_8 = _mm_set1_epi32(x[8]);
__m128i x_9 = _mm_set1_epi32(x[9]);
__m128i x_10 = _mm_set1_epi32(x[10]);
__m128i x_11 = _mm_set1_epi32(x[11]);
__m128i x_12;
__m128i x_13;
__m128i x_14 = _mm_set1_epi32(x[14]);
__m128i x_15 = _mm_set1_epi32(x[15]);
__m128i orig0 = x_0;
__m128i orig1 = x_1;
__m128i orig2 = x_2;
__m128i orig3 = x_3;
__m128i orig4 = x_4;
__m128i orig5 = x_5;
__m128i orig6 = x_6;
__m128i orig7 = x_7;
__m128i orig8 = x_8;
__m128i orig9 = x_9;
__m128i orig10 = x_10;
__m128i orig11 = x_11;
__m128i orig12;
__m128i orig13;
__m128i orig14 = x_14;
__m128i orig15 = x_15;
__m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
uint32_t in12, in13;
int i;
while(bytes >= 256)
{
const __m128i addv12 = _mm_set_epi64x(1, 0);
const __m128i addv13 = _mm_set_epi64x(3, 2);
__m128i t12, t13;
uint64_t in1213;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
t12 = _mm_set1_epi64x(in1213);
t13 = _mm_set1_epi64x(in1213);
x_12 = _mm_add_epi64(addv12, t12);
x_13 = _mm_add_epi64(addv13, t13);
t12 = _mm_unpacklo_epi32(x_12, x_13);
t13 = _mm_unpackhi_epi32(x_12, x_13);
x_12 = _mm_unpacklo_epi32(t12, t13);
x_13 = _mm_unpackhi_epi32(t12, t13);
orig12 = x_12;
orig13 = x_13;
in1213 += 4;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for(i = 0; i < ROUNDS; i += 2)
{
VEC4_QUARTERROUND(0, 4, 8, 12);
VEC4_QUARTERROUND(1, 5, 9, 13);
VEC4_QUARTERROUND(2, 6, 10, 14);
VEC4_QUARTERROUND(3, 7, 11, 15);
VEC4_QUARTERROUND(0, 5, 10, 15);
VEC4_QUARTERROUND(1, 6, 11, 12);
VEC4_QUARTERROUND(2, 7, 8, 13);
VEC4_QUARTERROUND(3, 4, 9, 14);
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
\
x_##A = _mm_add_epi32(x_##A, orig##A); \
x_##B = _mm_add_epi32(x_##B, orig##B); \
x_##C = _mm_add_epi32(x_##C, orig##C); \
x_##D = _mm_add_epi32(x_##D, orig##D); \
t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \
\
t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
}
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
bytes -= 256;
c += 256;
m += 256;
}
bytes -= 256;
c += 256;
m += 256;
}
}
#undef VEC4_ROT
#undef VEC4_QUARTERROUND
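
The VEC4 path processes four blocks per iteration by keeping word i of all four blocks in one __m128i; the only per-block difference is the counter, which addv12/addv13 derive from the running 64-bit value before the unpack steps re-split it into 32-bit lanes. A scalar sketch of the counter lanes it ends up producing (helper name illustrative):

#include <stdint.h>

/* The four lanes of x_12/x_13 hold counter, counter+1, counter+2,
 * counter+3, split into low and high 32-bit halves. */
static void
vec4_counters(uint64_t counter, uint32_t lo[4], uint32_t hi[4])
{
    int i;
    for (i = 0; i < 4; i++) {
        lo[i] = (uint32_t) (counter + (uint64_t) i);
        hi[i] = (uint32_t) ((counter + (uint64_t) i) >> 32);
    }
}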

@ -1,346 +1,344 @@
#define VEC8_ROT(A, IMM) \
_mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
_mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
/* implements a vector quarter round by-the-book (naive!) */
#define VEC8_QUARTERROUND_NAIVE(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = VEC8_ROT(t_##A, 8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
* 16) (better) */
#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
/* same, but replace 2 of the shift/shift/or "rotation" by byte & word shuffles
* (8 & 16) (not as good as previous) */
#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 12); \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
t_##A = _mm256_xor_si256(x_##D, x_##A); \
x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
t_##C = _mm256_xor_si256(x_##B, x_##C); \
x_##B = VEC8_ROT(t_##C, 7)
#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D)
#define VEC8_LINE1(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
#define VEC8_LINE2(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
#define VEC8_LINE3(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
#define VEC8_LINE4(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
#define VEC8_LINE1(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
#define VEC8_LINE2(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
#define VEC8_LINE3(A, B, C, D) \
x_##A = _mm256_add_epi32(x_##A, x_##B); \
x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
#define VEC8_LINE4(A, B, C, D) \
x_##C = _mm256_add_epi32(x_##C, x_##D); \
x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \
C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \
B4, C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \
A4, B4, C4, D4) \
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
VEC8_LINE1(A1, B1, C1, D1); \
VEC8_LINE1(A2, B2, C2, D2); \
VEC8_LINE2(A1, B1, C1, D1); \
VEC8_LINE2(A2, B2, C2, D2); \
VEC8_LINE1(A3, B3, C3, D3); \
VEC8_LINE1(A4, B4, C4, D4); \
VEC8_LINE2(A3, B3, C3, D3); \
VEC8_LINE2(A4, B4, C4, D4); \
VEC8_LINE3(A1, B1, C1, D1); \
VEC8_LINE3(A2, B2, C2, D2); \
VEC8_LINE4(A1, B1, C1, D1); \
VEC8_LINE4(A2, B2, C2, D2); \
VEC8_LINE3(A3, B3, C3, D3); \
VEC8_LINE3(A4, B4, C4, D4); \
VEC8_LINE4(A3, B3, C3, D3); \
VEC8_LINE4(A4, B4, C4, D4)
#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
D4) \
VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
D4)
if (bytes >= 512) {
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m256i rot16 =
_mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m256i rot8 =
_mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12, in13;
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i x_0 = _mm256_set1_epi32(x[0]);
__m256i x_1 = _mm256_set1_epi32(x[1]);
__m256i x_2 = _mm256_set1_epi32(x[2]);
__m256i x_3 = _mm256_set1_epi32(x[3]);
__m256i x_4 = _mm256_set1_epi32(x[4]);
__m256i x_5 = _mm256_set1_epi32(x[5]);
__m256i x_6 = _mm256_set1_epi32(x[6]);
__m256i x_7 = _mm256_set1_epi32(x[7]);
__m256i x_8 = _mm256_set1_epi32(x[8]);
__m256i x_9 = _mm256_set1_epi32(x[9]);
__m256i x_10 = _mm256_set1_epi32(x[10]);
__m256i x_11 = _mm256_set1_epi32(x[11]);
__m256i x_12;
__m256i x_13;
__m256i x_14 = _mm256_set1_epi32(x[14]);
__m256i x_15 = _mm256_set1_epi32(x[15]);
__m256i orig0 = x_0;
__m256i orig1 = x_1;
__m256i orig2 = x_2;
__m256i orig3 = x_3;
__m256i orig4 = x_4;
__m256i orig5 = x_5;
__m256i orig6 = x_6;
__m256i orig7 = x_7;
__m256i orig8 = x_8;
__m256i orig9 = x_9;
__m256i orig10 = x_10;
__m256i orig11 = x_11;
__m256i orig12;
__m256i orig13;
__m256i orig14 = x_14;
__m256i orig15 = x_15;
__m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
while (bytes >= 512) {
const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t12, t13;
uint64_t in1213;
int i;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
t12 = _mm256_add_epi64(addv12, x_12);
t13 = _mm256_add_epi64(addv13, x_13);
x_12 = _mm256_unpacklo_epi32(t12, t13);
x_13 = _mm256_unpackhi_epi32(t12, t13);
t12 = _mm256_unpacklo_epi32(x_12, x_13);
t13 = _mm256_unpackhi_epi32(x_12, x_13);
/* required because unpack* are intra-lane */
x_12 = _mm256_permutevar8x32_epi32(t12, permute);
x_13 = _mm256_permutevar8x32_epi32(t13, permute);
orig12 = x_12;
orig13 = x_13;
in1213 += 8;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for (i = 0; i < ROUNDS; i += 2) {
VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \
_mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \
_mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \
_mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \
_mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \
_mm_loadu_si128((__m128i*) (m + 256))); \
_mm_storeu_si128((__m128i*) (c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \
_mm_loadu_si128((__m128i*) (m + 320))); \
_mm_storeu_si128((__m128i*) (c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \
_mm_loadu_si128((__m128i*) (m + 384))); \
_mm_storeu_si128((__m128i*) (c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \
_mm_loadu_si128((__m128i*) (m + 448))); \
_mm_storeu_si128((__m128i*) (c + 448), t3); \
VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, D4)
if(bytes >= 512)
{
/* constant for shuffling bytes (replacing multiple-of-8 rotates) */
__m256i rot16 =
_mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13,
12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
__m256i rot8 =
_mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14,
13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
uint32_t in12, in13;
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i x_0 = _mm256_set1_epi32(x[0]);
__m256i x_1 = _mm256_set1_epi32(x[1]);
__m256i x_2 = _mm256_set1_epi32(x[2]);
__m256i x_3 = _mm256_set1_epi32(x[3]);
__m256i x_4 = _mm256_set1_epi32(x[4]);
__m256i x_5 = _mm256_set1_epi32(x[5]);
__m256i x_6 = _mm256_set1_epi32(x[6]);
__m256i x_7 = _mm256_set1_epi32(x[7]);
__m256i x_8 = _mm256_set1_epi32(x[8]);
__m256i x_9 = _mm256_set1_epi32(x[9]);
__m256i x_10 = _mm256_set1_epi32(x[10]);
__m256i x_11 = _mm256_set1_epi32(x[11]);
__m256i x_12;
__m256i x_13;
__m256i x_14 = _mm256_set1_epi32(x[14]);
__m256i x_15 = _mm256_set1_epi32(x[15]);
__m256i orig0 = x_0;
__m256i orig1 = x_1;
__m256i orig2 = x_2;
__m256i orig3 = x_3;
__m256i orig4 = x_4;
__m256i orig5 = x_5;
__m256i orig6 = x_6;
__m256i orig7 = x_7;
__m256i orig8 = x_8;
__m256i orig9 = x_9;
__m256i orig10 = x_10;
__m256i orig11 = x_11;
__m256i orig12;
__m256i orig13;
__m256i orig14 = x_14;
__m256i orig15 = x_15;
__m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
t_13, t_14, t_15;
while(bytes >= 512)
{
const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t12, t13;
uint64_t in1213;
int i;
x_0 = orig0;
x_1 = orig1;
x_2 = orig2;
x_3 = orig3;
x_4 = orig4;
x_5 = orig5;
x_6 = orig6;
x_7 = orig7;
x_8 = orig8;
x_9 = orig9;
x_10 = orig10;
x_11 = orig11;
x_14 = orig14;
x_15 = orig15;
in12 = x[12];
in13 = x[13];
in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
t12 = _mm256_add_epi64(addv12, x_12);
t13 = _mm256_add_epi64(addv13, x_13);
x_12 = _mm256_unpacklo_epi32(t12, t13);
x_13 = _mm256_unpackhi_epi32(t12, t13);
t12 = _mm256_unpacklo_epi32(x_12, x_13);
t13 = _mm256_unpackhi_epi32(x_12, x_13);
/* required because unpack* are intra-lane */
x_12 = _mm256_permutevar8x32_epi32(t12, permute);
x_13 = _mm256_permutevar8x32_epi32(t13, permute);
orig12 = x_12;
orig13 = x_13;
in1213 += 8;
x[12] = in1213 & 0xFFFFFFFF;
x[13] = (in1213 >> 32) & 0xFFFFFFFF;
for(i = 0; i < ROUNDS; i += 2)
{
VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \
_mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \
_mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \
_mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \
_mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \
_mm_loadu_si128((__m128i*)(m + 256))); \
_mm_storeu_si128((__m128i*)(c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \
_mm_loadu_si128((__m128i*)(m + 320))); \
_mm_storeu_si128((__m128i*)(c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \
_mm_loadu_si128((__m128i*)(m + 384))); \
_mm_storeu_si128((__m128i*)(c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \
_mm_loadu_si128((__m128i*)(m + 448))); \
_mm_storeu_si128((__m128i*)(c + 448), t3); \
}
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
}
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
x_##A = _mm256_add_epi32(x_##A, orig##A); \
x_##B = _mm256_add_epi32(x_##B, orig##B); \
x_##C = _mm256_add_epi32(x_##C, orig##C); \
x_##D = _mm256_add_epi32(x_##D, orig##D); \
t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
}
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \
t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \
t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \
t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \
t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \
t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \
t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \
t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \
t_##A = \
_mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
t_##B = \
_mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
t_##C = \
_mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
t_##D = \
_mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
t_##A2 = _mm256_xor_si256(t_##A2, \
_mm256_loadu_si256((__m256i*) (m + 256))); \
t_##B2 = _mm256_xor_si256(t_##B2, \
_mm256_loadu_si256((__m256i*) (m + 320))); \
t_##C2 = _mm256_xor_si256(t_##C2, \
_mm256_loadu_si256((__m256i*) (m + 384))); \
t_##D2 = _mm256_xor_si256(t_##D2, \
_mm256_loadu_si256((__m256i*) (m + 448))); \
_mm256_storeu_si256((__m256i*) (c + 0), t_##A); \
_mm256_storeu_si256((__m256i*) (c + 64), t_##B); \
_mm256_storeu_si256((__m256i*) (c + 128), t_##C); \
_mm256_storeu_si256((__m256i*) (c + 192), t_##D); \
_mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \
_mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \
_mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \
_mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \
t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \
t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \
t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \
t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \
t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \
t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \
t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \
t_##A = _mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*)(m + 0))); \
t_##B = _mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*)(m + 64))); \
t_##C = _mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*)(m + 128))); \
t_##D = _mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*)(m + 192))); \
t_##A2 = \
_mm256_xor_si256(t_##A2, _mm256_loadu_si256((__m256i*)(m + 256))); \
t_##B2 = \
_mm256_xor_si256(t_##B2, _mm256_loadu_si256((__m256i*)(m + 320))); \
t_##C2 = \
_mm256_xor_si256(t_##C2, _mm256_loadu_si256((__m256i*)(m + 384))); \
t_##D2 = \
_mm256_xor_si256(t_##D2, _mm256_loadu_si256((__m256i*)(m + 448))); \
_mm256_storeu_si256((__m256i*)(c + 0), t_##A); \
_mm256_storeu_si256((__m256i*)(c + 64), t_##B); \
_mm256_storeu_si256((__m256i*)(c + 128), t_##C); \
_mm256_storeu_si256((__m256i*)(c + 192), t_##D); \
_mm256_storeu_si256((__m256i*)(c + 256), t_##A2); \
_mm256_storeu_si256((__m256i*)(c + 320), t_##B2); \
_mm256_storeu_si256((__m256i*)(c + 384), t_##C2); \
_mm256_storeu_si256((__m256i*)(c + 448), t_##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO
bytes -= 512;
c += 512;
m += 512;
}
bytes -= 512;
c += 512;
m += 512;
}
}
#undef VEC8_ROT
#undef VEC8_QUARTERROUND
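
In the AVX2 path, ONEOCTO regroups the transposed eight-block results with _mm256_permute2x128_si256: immediate 0x20 concatenates the two operands' low 128-bit halves and 0x31 their high halves, which is what lets each 32-byte store land on a contiguous region of one output block. A standalone sketch of those two selectors (not code from this tree):

#include <immintrin.h>

/* lo128(a, b) = [a.lo | b.lo], hi128(a, b) = [a.hi | b.hi] -- the two
 * lane combinations ONEOCTO uses to regroup per-block halves. */
static __m256i lo128(__m256i a, __m256i b) { return _mm256_permute2x128_si256(a, b, 0x20); }
static __m256i hi128(__m256i a, __m256i b) { return _mm256_permute2x128_si256(a, b, 0x31); }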

@ -4,19 +4,18 @@
#include <stdint.h>
typedef struct crypto_stream_chacha20_implementation {
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_ietf)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
int (*stream_ietf_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
typedef struct crypto_stream_chacha20_implementation
{
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_ietf)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
int (*stream_ietf_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint32_t ic, const unsigned char *k);
} crypto_stream_chacha20_implementation;
#endif
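
The struct above is the dispatch table the library uses to select a ChaCha20 backend at runtime. A hedged sketch of registering one; the function name is a placeholder, and only the first member is filled here for brevity:

/* Placeholder entry point with the required signature. */
static int
example_stream(unsigned char *c, unsigned long long clen,
               const unsigned char *n, const unsigned char *k)
{
    (void) c; (void) clen; (void) n; (void) k;
    return 0;
}

/* A real backend fills all four members; the tree wraps designated
 * initializers in SODIUM_C99() for C89 builds, as in the sandy2x
 * scalarmult table further down. */
static const crypto_stream_chacha20_implementation example_impl = {
    .stream = example_stream,
};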

@ -74,8 +74,8 @@ randombytes_stir(void)
try
{
var window_ = 'object' === typeof window ? window : self;
var crypto_ = typeof window_.crypto !== 'undefined' ? window_.crypto
: window_.msCrypto;
var crypto_ = typeof window_.crypto !==
'undefined' ? window_.crypto : window_.msCrypto;
var randomValuesStandard = function()
{
var buf = new Uint32Array(1);

@ -69,7 +69,7 @@ typedef NTSTATUS(FAR PASCAL *CNGAPI_DRBG)(BCRYPT_ALG_HANDLE, UCHAR *, ULONG,
#endif
#ifndef TLS
#ifdef _WIN32
#ifdef _WIN32
#ifdef _MSC_VER
#define TLS __declspec(thread)
#else

File diff suppressed because it is too large

@ -1,40 +1,52 @@
{
{ 25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626, -11754271, -6079156, 2047605 },
{ -12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384, 19500929, -15469378 },
{ -8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919, 11864899, -24514362, -4438546 }
},
{
{ 15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600, -14772189, 28944400, -1550024 },
{ 16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577, -11775962, 7689662, 11199574 },
{ 30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774, 10017326, -17749093, -9920357 }
},
{
{ 10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885, 14515107, -15438304, 10819380 },
{ 4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668, 12483688, -12668491, 5581306 },
{ 19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350, 13850243, -23678021, -15815942 }
},
{
{ 5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, 5230134, -23952439, -15175766 },
{ -30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025, 16520125, 30598449, 7715701 },
{ 28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660, 1370708, 29794553, -1409300 }
},
{
{ -22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211, -1361450, -13062696, 13821877 },
{ -6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028, -7212327, 18853322, -14220951 },
{ 4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358, -10431137, 2207753, -3209784 }
},
{
{ -25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364, -663000, -31111463, -16132436 },
{ 25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789, 15725684, 171356, 6466918 },
{ 23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339, -14088058, -30714912, 16193877 }
},
{
{ -33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398, 4729455, -18074513, 9256800 },
{ -25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405, 9761698, -19827198, 630305 },
{ -13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551, -15960994, -2449256, -14291300 }
},
{
{ -3151181, -5046075, 9282714, 6866145, -31907062, -863023, -18940575, 15033784, 25105118, -7894876 },
{ -24326370, 15950226, -31801215, -14592823, -11662737, -5090925, 1573892, -2625887, 2198790, -15804619 },
{ -3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022, -16236442, -32461234, -12290683 }
{{25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626,
-11754271, -6079156, 2047605},
{-12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384,
19500929, -15469378},
{-8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919,
11864899, -24514362, -4438546}},
{{15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600,
-14772189, 28944400, -1550024},
{16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577,
-11775962, 7689662, 11199574},
{30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774,
10017326, -17749093, -9920357}},
{{10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885,
14515107, -15438304, 10819380},
{4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668,
12483688, -12668491, 5581306},
{19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350,
13850243, -23678021, -15815942}},
{{5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, 5230134,
-23952439, -15175766},
{-30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025,
16520125, 30598449, 7715701},
{28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660,
1370708, 29794553, -1409300}},
{{-22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211,
-1361450, -13062696, 13821877},
{-6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028,
-7212327, 18853322, -14220951},
{4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358,
-10431137, 2207753, -3209784}},
{{-25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364,
-663000, -31111463, -16132436},
{25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789,
15725684, 171356, 6466918},
{23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339,
-14088058, -30714912, 16193877}},
{{-33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398,
4729455, -18074513, 9256800},
{-25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405,
9761698, -19827198, 630305},
{-13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551,
-15960994, -2449256, -14291300}},
{
{-3151181, -5046075, 9282714, 6866145, -31907062,
-863023, -18940575, 15033784, 25105118, -7894876},
{-24326370, 15950226, -31801215, -14592823, -11662737,
-5090925, 1573892, -2625887, 2198790, -15804619},
{
-3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022,
-16236442, -32461234, -12290683
}
}
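
The table fragment above holds precomputed multiples of the Ed25519 base point in the ref10 "precomp" form: each entry is three field elements (y+x, y-x, 2dxy), so the mixed point addition can skip recomputing those sums. A sketch of the corresponding layout (ref10-style naming; illustrative only):

#include <stdint.h>

/* A field element in radix-2^25.5: ten signed limbs alternating 26 and
 * 25 bits. */
typedef int32_t fe25519_sketch[10];

/* Precomputed-point form matching the three-row entries above:
 * yplusx = y + x, yminusx = y - x, xy2d = 2*d*x*y. */
typedef struct {
    fe25519_sketch yplusx;
    fe25519_sketch yminusx;
    fe25519_sketch xy2d;
} ge25519_precomp_sketch;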

@ -1,20 +1,18 @@
/* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
static const fe25519 d = {
-10913610, 13857413, -15372611, 6949391, 114729, -8787816, -6275908, -3247719, -18696448, -12055116
};
/* 37095705934669439343138083508754565189542113879843219016388785533085940283555
*/
static const fe25519 d = {-10913610, 13857413, -15372611, 6949391, 114729,
-8787816, -6275908, -3247719, -18696448, -12055116};
/* 2 * d =
* 16295367250680780974490674513165176452449235426866156013048779062215315747161
*/
static const fe25519 d2 = {
-21827239, -5839606, -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199 };
static const fe25519 d2 = {-21827239, -5839606, -30745221, 13898782, 229458,
15978800, -12551817, -6495438, 29715968, 9444199};
/* sqrt(-1) */
static const fe25519 sqrtm1 = {
-32595792, -7943725, 9377950, 3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482
};
static const fe25519 sqrtm1 = {-32595792, -7943725, 9377950, 3500415,
12389472, -272473, -25146209, -2005654,
326686, 11406482};
/* A = 486662 */
static const fe25519 curve25519_A = {
486662, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
static const fe25519 curve25519_A = {486662, 0, 0, 0, 0, 0, 0, 0, 0, 0};

@ -5,70 +5,70 @@
void
fe25519_frombytes(fe25519 h, const unsigned char *s)
{
int64_t h0 = load_4(s);
int64_t h1 = load_3(s + 4) << 6;
int64_t h2 = load_3(s + 7) << 5;
int64_t h3 = load_3(s + 10) << 3;
int64_t h4 = load_3(s + 13) << 2;
int64_t h5 = load_4(s + 16);
int64_t h6 = load_3(s + 20) << 7;
int64_t h7 = load_3(s + 23) << 5;
int64_t h8 = load_3(s + 26) << 4;
int64_t h9 = (load_3(s + 29) & 8388607) << 2;
int64_t carry0;
int64_t carry1;
int64_t carry2;
int64_t carry3;
int64_t carry4;
int64_t carry5;
int64_t carry6;
int64_t carry7;
int64_t carry8;
int64_t carry9;
carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
h0 += carry9 * 19;
h9 -= carry9 * ((uint64_t) 1L << 25);
carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
h2 += carry1;
h1 -= carry1 * ((uint64_t) 1L << 25);
carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
h4 += carry3;
h3 -= carry3 * ((uint64_t) 1L << 25);
carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
h6 += carry5;
h5 -= carry5 * ((uint64_t) 1L << 25);
carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
h8 += carry7;
h7 -= carry7 * ((uint64_t) 1L << 25);
carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
h1 += carry0;
h0 -= carry0 * ((uint64_t) 1L << 26);
carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
h3 += carry2;
h2 -= carry2 * ((uint64_t) 1L << 26);
carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
h5 += carry4;
h4 -= carry4 * ((uint64_t) 1L << 26);
carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
h7 += carry6;
h6 -= carry6 * ((uint64_t) 1L << 26);
carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
h9 += carry8;
h8 -= carry8 * ((uint64_t) 1L << 26);
h[0] = (int32_t) h0;
h[1] = (int32_t) h1;
h[2] = (int32_t) h2;
h[3] = (int32_t) h3;
h[4] = (int32_t) h4;
h[5] = (int32_t) h5;
h[6] = (int32_t) h6;
h[7] = (int32_t) h7;
h[8] = (int32_t) h8;
h[9] = (int32_t) h9;
int64_t h0 = load_4(s);
int64_t h1 = load_3(s + 4) << 6;
int64_t h2 = load_3(s + 7) << 5;
int64_t h3 = load_3(s + 10) << 3;
int64_t h4 = load_3(s + 13) << 2;
int64_t h5 = load_4(s + 16);
int64_t h6 = load_3(s + 20) << 7;
int64_t h7 = load_3(s + 23) << 5;
int64_t h8 = load_3(s + 26) << 4;
int64_t h9 = (load_3(s + 29) & 8388607) << 2;
int64_t carry0;
int64_t carry1;
int64_t carry2;
int64_t carry3;
int64_t carry4;
int64_t carry5;
int64_t carry6;
int64_t carry7;
int64_t carry8;
int64_t carry9;
carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
h0 += carry9 * 19;
h9 -= carry9 * ((uint64_t)1L << 25);
carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
h2 += carry1;
h1 -= carry1 * ((uint64_t)1L << 25);
carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
h4 += carry3;
h3 -= carry3 * ((uint64_t)1L << 25);
carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
h6 += carry5;
h5 -= carry5 * ((uint64_t)1L << 25);
carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
h8 += carry7;
h7 -= carry7 * ((uint64_t)1L << 25);
carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
h1 += carry0;
h0 -= carry0 * ((uint64_t)1L << 26);
carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
h3 += carry2;
h2 -= carry2 * ((uint64_t)1L << 26);
carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
h5 += carry4;
h4 -= carry4 * ((uint64_t)1L << 26);
carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
h7 += carry6;
h6 -= carry6 * ((uint64_t)1L << 26);
carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
h9 += carry8;
h8 -= carry8 * ((uint64_t)1L << 26);
h[0] = (int32_t)h0;
h[1] = (int32_t)h1;
h[2] = (int32_t)h2;
h[3] = (int32_t)h3;
h[4] = (int32_t)h4;
h[5] = (int32_t)h5;
h[6] = (int32_t)h6;
h[7] = (int32_t)h7;
h[8] = (int32_t)h8;
h[9] = (int32_t)h9;
}
/*
@ -99,76 +99,77 @@ fe25519_frombytes(fe25519 h, const unsigned char *s)
static void
fe25519_reduce(fe25519 h, const fe25519 f)
{
int32_t h0 = f[0];
int32_t h1 = f[1];
int32_t h2 = f[2];
int32_t h3 = f[3];
int32_t h4 = f[4];
int32_t h5 = f[5];
int32_t h6 = f[6];
int32_t h7 = f[7];
int32_t h8 = f[8];
int32_t h9 = f[9];
int32_t q;
int32_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;
q = (19 * h9 + ((uint32_t) 1L << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
q = (h2 + q) >> 26;
q = (h3 + q) >> 25;
q = (h4 + q) >> 26;
q = (h5 + q) >> 25;
q = (h6 + q) >> 26;
q = (h7 + q) >> 25;
q = (h8 + q) >> 26;
q = (h9 + q) >> 25;
/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
h0 += 19 * q;
/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
carry0 = h0 >> 26;
h1 += carry0;
h0 -= carry0 * ((uint32_t) 1L << 26);
carry1 = h1 >> 25;
h2 += carry1;
h1 -= carry1 * ((uint32_t) 1L << 25);
carry2 = h2 >> 26;
h3 += carry2;
h2 -= carry2 * ((uint32_t) 1L << 26);
carry3 = h3 >> 25;
h4 += carry3;
h3 -= carry3 * ((uint32_t) 1L << 25);
carry4 = h4 >> 26;
h5 += carry4;
h4 -= carry4 * ((uint32_t) 1L << 26);
carry5 = h5 >> 25;
h6 += carry5;
h5 -= carry5 * ((uint32_t) 1L << 25);
carry6 = h6 >> 26;
h7 += carry6;
h6 -= carry6 * ((uint32_t) 1L << 26);
carry7 = h7 >> 25;
h8 += carry7;
h7 -= carry7 * ((uint32_t) 1L << 25);
carry8 = h8 >> 26;
h9 += carry8;
h8 -= carry8 * ((uint32_t) 1L << 26);
carry9 = h9 >> 25;
h9 -= carry9 * ((uint32_t) 1L << 25);
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
int32_t h0 = f[0];
int32_t h1 = f[1];
int32_t h2 = f[2];
int32_t h3 = f[3];
int32_t h4 = f[4];
int32_t h5 = f[5];
int32_t h6 = f[6];
int32_t h7 = f[7];
int32_t h8 = f[8];
int32_t h9 = f[9];
int32_t q;
int32_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7,
carry8, carry9;
q = (19 * h9 + ((uint32_t)1L << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
q = (h2 + q) >> 26;
q = (h3 + q) >> 25;
q = (h4 + q) >> 26;
q = (h5 + q) >> 25;
q = (h6 + q) >> 26;
q = (h7 + q) >> 25;
q = (h8 + q) >> 26;
q = (h9 + q) >> 25;
/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
h0 += 19 * q;
/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
carry0 = h0 >> 26;
h1 += carry0;
h0 -= carry0 * ((uint32_t)1L << 26);
carry1 = h1 >> 25;
h2 += carry1;
h1 -= carry1 * ((uint32_t)1L << 25);
carry2 = h2 >> 26;
h3 += carry2;
h2 -= carry2 * ((uint32_t)1L << 26);
carry3 = h3 >> 25;
h4 += carry3;
h3 -= carry3 * ((uint32_t)1L << 25);
carry4 = h4 >> 26;
h5 += carry4;
h4 -= carry4 * ((uint32_t)1L << 26);
carry5 = h5 >> 25;
h6 += carry5;
h5 -= carry5 * ((uint32_t)1L << 25);
carry6 = h6 >> 26;
h7 += carry6;
h6 -= carry6 * ((uint32_t)1L << 26);
carry7 = h7 >> 25;
h8 += carry7;
h7 -= carry7 * ((uint32_t)1L << 25);
carry8 = h8 >> 26;
h9 += carry8;
h8 -= carry8 * ((uint32_t)1L << 26);
carry9 = h9 >> 25;
h9 -= carry9 * ((uint32_t)1L << 25);
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
}
/*
@ -182,39 +183,39 @@ fe25519_reduce(fe25519 h, const fe25519 f)
void
fe25519_tobytes(unsigned char *s, const fe25519 h)
{
fe25519 t;
fe25519_reduce(t, h);
s[0] = t[0] >> 0;
s[1] = t[0] >> 8;
s[2] = t[0] >> 16;
s[3] = (t[0] >> 24) | (t[1] * ((uint32_t) 1 << 2));
s[4] = t[1] >> 6;
s[5] = t[1] >> 14;
s[6] = (t[1] >> 22) | (t[2] * ((uint32_t) 1 << 3));
s[7] = t[2] >> 5;
s[8] = t[2] >> 13;
s[9] = (t[2] >> 21) | (t[3] * ((uint32_t) 1 << 5));
s[10] = t[3] >> 3;
s[11] = t[3] >> 11;
s[12] = (t[3] >> 19) | (t[4] * ((uint32_t) 1 << 6));
s[13] = t[4] >> 2;
s[14] = t[4] >> 10;
s[15] = t[4] >> 18;
s[16] = t[5] >> 0;
s[17] = t[5] >> 8;
s[18] = t[5] >> 16;
s[19] = (t[5] >> 24) | (t[6] * ((uint32_t) 1 << 1));
s[20] = t[6] >> 7;
s[21] = t[6] >> 15;
s[22] = (t[6] >> 23) | (t[7] * ((uint32_t) 1 << 3));
s[23] = t[7] >> 5;
s[24] = t[7] >> 13;
s[25] = (t[7] >> 21) | (t[8] * ((uint32_t) 1 << 4));
s[26] = t[8] >> 4;
s[27] = t[8] >> 12;
s[28] = (t[8] >> 20) | (t[9] * ((uint32_t) 1 << 6));
s[29] = t[9] >> 2;
s[30] = t[9] >> 10;
s[31] = t[9] >> 18;
fe25519 t;
fe25519_reduce(t, h);
s[0] = t[0] >> 0;
s[1] = t[0] >> 8;
s[2] = t[0] >> 16;
s[3] = (t[0] >> 24) | (t[1] * ((uint32_t)1 << 2));
s[4] = t[1] >> 6;
s[5] = t[1] >> 14;
s[6] = (t[1] >> 22) | (t[2] * ((uint32_t)1 << 3));
s[7] = t[2] >> 5;
s[8] = t[2] >> 13;
s[9] = (t[2] >> 21) | (t[3] * ((uint32_t)1 << 5));
s[10] = t[3] >> 3;
s[11] = t[3] >> 11;
s[12] = (t[3] >> 19) | (t[4] * ((uint32_t)1 << 6));
s[13] = t[4] >> 2;
s[14] = t[4] >> 10;
s[15] = t[4] >> 18;
s[16] = t[5] >> 0;
s[17] = t[5] >> 8;
s[18] = t[5] >> 16;
s[19] = (t[5] >> 24) | (t[6] * ((uint32_t)1 << 1));
s[20] = t[6] >> 7;
s[21] = t[6] >> 15;
s[22] = (t[6] >> 23) | (t[7] * ((uint32_t)1 << 3));
s[23] = t[7] >> 5;
s[24] = t[7] >> 13;
s[25] = (t[7] >> 21) | (t[8] * ((uint32_t)1 << 4));
s[26] = t[8] >> 4;
s[27] = t[8] >> 12;
s[28] = (t[8] >> 20) | (t[9] * ((uint32_t)1 << 6));
s[29] = t[9] >> 2;
s[30] = t[9] >> 10;
s[31] = t[9] >> 18;
}
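
fe25519_frombytes and fe25519_tobytes above convert between 32-byte little-endian strings and the ten-limb radix-2^25.5 representation (limb sizes alternate 26 and 25 bits, so limb i starts at bit ceil(25.5*i)). The load_3/load_4 helpers they rely on are the usual little-endian partial-word loads; a sketch of them, assuming they match the ref10 originals:

#include <stdint.h>

/* Load 3 or 4 bytes little-endian into a 64-bit accumulator, as used by
 * fe25519_frombytes above. */
static uint64_t
load_3(const unsigned char *in)
{
    return (uint64_t) in[0] | ((uint64_t) in[1] << 8) | ((uint64_t) in[2] << 16);
}

static uint64_t
load_4(const unsigned char *in)
{
    return (uint64_t) in[0] | ((uint64_t) in[1] << 8) |
           ((uint64_t) in[2] << 16) | ((uint64_t) in[3] << 24);
}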

@ -17,4 +17,3 @@
#define REDMASK51 crypto_scalarmult_curve25519_sandy2x_REDMASK51
#endif /* ifndef consts_namespace_H */

@ -26,13 +26,14 @@ crypto_scalarmult_curve25519_sandy2x(unsigned char *q, const unsigned char *n,
const unsigned char *p)
{
unsigned char *t = q;
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for (i = 0; i < 32; i++) {
t[i] = n[i];
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for(i = 0; i < 32; i++)
{
t[i] = n[i];
}
t[0] &= 248;
t[31] &= 127;
@ -72,13 +73,14 @@ crypto_scalarmult_curve25519_sandy2x_base(unsigned char *q,
const unsigned char *n)
{
unsigned char *t = q;
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for (i = 0;i < 32; i++) {
t[i] = n[i];
fe var[3];
fe51 x_51;
fe51 z_51;
unsigned int i;
for(i = 0; i < 32; i++)
{
t[i] = n[i];
}
t[0] &= 248;
t[31] &= 127;
@ -106,9 +108,8 @@ crypto_scalarmult_curve25519_sandy2x_base(unsigned char *q,
}
struct crypto_scalarmult_curve25519_implementation
crypto_scalarmult_curve25519_sandy2x_implementation = {
SODIUM_C99(.mult = ) crypto_scalarmult_curve25519_sandy2x,
SODIUM_C99(.mult_base = ) crypto_scalarmult_curve25519_sandy2x_base
};
crypto_scalarmult_curve25519_sandy2x_implementation = {
SODIUM_C99(.mult =) crypto_scalarmult_curve25519_sandy2x,
SODIUM_C99(.mult_base =) crypto_scalarmult_curve25519_sandy2x_base};
#endif

@ -21,6 +21,7 @@ Bounds on each t[i] vary depending on context.
#define fe_frombytes crypto_scalarmult_curve25519_sandy2x_fe_frombytes
extern void fe_frombytes(fe, const unsigned char *);
extern void
fe_frombytes(fe, const unsigned char *);
#endif

@ -9,7 +9,8 @@
#define fe51_H
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#include <stdint.h>
@ -17,16 +18,19 @@ extern "C" {
#include "fe51_namespace.h"
typedef struct
{
typedef struct
{
uint64_t v[5];
}
fe51;
extern void fe51_pack(unsigned char *, const fe51 *);
extern void fe51_mul(fe51 *, const fe51 *, const fe51 *);
extern void fe51_nsquare(fe51 *, const fe51 *, int);
extern void fe51_invert(fe51 *, const fe51 *);
} fe51;
extern void
fe51_pack(unsigned char *, const fe51 *);
extern void
fe51_mul(fe51 *, const fe51 *, const fe51 *);
extern void
fe51_nsquare(fe51 *, const fe51 *, int);
extern void
fe51_invert(fe51 *, const fe51 *);
#ifdef __cplusplus
}

@ -12,47 +12,47 @@
void
fe51_invert(fe51 *r, const fe51 *x)
{
fe51 z2;
fe51 z9;
fe51 z11;
fe51 z2_5_0;
fe51 z2_10_0;
fe51 z2_20_0;
fe51 z2_50_0;
fe51 z2_100_0;
fe51 t;
/* 2 */ fe51_square(&z2,x);
/* 4 */ fe51_square(&t,&z2);
/* 8 */ fe51_square(&t,&t);
/* 9 */ fe51_mul(&z9,&t,x);
/* 11 */ fe51_mul(&z11,&z9,&z2);
/* 22 */ fe51_square(&t,&z11);
/* 2^5 - 2^0 = 31 */ fe51_mul(&z2_5_0,&t,&z9);
/* 2^10 - 2^5 */ fe51_nsquare(&t,&z2_5_0, 5);
/* 2^10 - 2^0 */ fe51_mul(&z2_10_0,&t,&z2_5_0);
/* 2^20 - 2^10 */ fe51_nsquare(&t,&z2_10_0, 10);
/* 2^20 - 2^0 */ fe51_mul(&z2_20_0,&t,&z2_10_0);
/* 2^40 - 2^20 */ fe51_nsquare(&t,&z2_20_0, 20);
/* 2^40 - 2^0 */ fe51_mul(&t,&t,&z2_20_0);
/* 2^50 - 2^10 */ fe51_nsquare(&t,&t,10);
/* 2^50 - 2^0 */ fe51_mul(&z2_50_0,&t,&z2_10_0);
/* 2^100 - 2^50 */ fe51_nsquare(&t,&z2_50_0, 50);
/* 2^100 - 2^0 */ fe51_mul(&z2_100_0,&t,&z2_50_0);
/* 2^200 - 2^100 */ fe51_nsquare(&t,&z2_100_0, 100);
/* 2^200 - 2^0 */ fe51_mul(&t,&t,&z2_100_0);
/* 2^250 - 2^50 */ fe51_nsquare(&t,&t, 50);
/* 2^250 - 2^0 */ fe51_mul(&t,&t,&z2_50_0);
/* 2^255 - 2^5 */ fe51_nsquare(&t,&t,5);
/* 2^255 - 21 */ fe51_mul(r,&t,&z11);
fe51 z2;
fe51 z9;
fe51 z11;
fe51 z2_5_0;
fe51 z2_10_0;
fe51 z2_20_0;
fe51 z2_50_0;
fe51 z2_100_0;
fe51 t;
/* 2 */ fe51_square(&z2, x);
/* 4 */ fe51_square(&t, &z2);
/* 8 */ fe51_square(&t, &t);
/* 9 */ fe51_mul(&z9, &t, x);
/* 11 */ fe51_mul(&z11, &z9, &z2);
/* 22 */ fe51_square(&t, &z11);
/* 2^5 - 2^0 = 31 */ fe51_mul(&z2_5_0, &t, &z9);
/* 2^10 - 2^5 */ fe51_nsquare(&t, &z2_5_0, 5);
/* 2^10 - 2^0 */ fe51_mul(&z2_10_0, &t, &z2_5_0);
/* 2^20 - 2^10 */ fe51_nsquare(&t, &z2_10_0, 10);
/* 2^20 - 2^0 */ fe51_mul(&z2_20_0, &t, &z2_10_0);
/* 2^40 - 2^20 */ fe51_nsquare(&t, &z2_20_0, 20);
/* 2^40 - 2^0 */ fe51_mul(&t, &t, &z2_20_0);
/* 2^50 - 2^10 */ fe51_nsquare(&t, &t, 10);
/* 2^50 - 2^0 */ fe51_mul(&z2_50_0, &t, &z2_10_0);
/* 2^100 - 2^50 */ fe51_nsquare(&t, &z2_50_0, 50);
/* 2^100 - 2^0 */ fe51_mul(&z2_100_0, &t, &z2_50_0);
/* 2^200 - 2^100 */ fe51_nsquare(&t, &z2_100_0, 100);
/* 2^200 - 2^0 */ fe51_mul(&t, &t, &z2_100_0);
/* 2^250 - 2^50 */ fe51_nsquare(&t, &t, 50);
/* 2^250 - 2^0 */ fe51_mul(&t, &t, &z2_50_0);
/* 2^255 - 2^5 */ fe51_nsquare(&t, &t, 5);
/* 2^255 - 21 */ fe51_mul(r, &t, &z11);
}
#endif
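
/*
 * Usage sketch (illustrative only): fe51_invert() computes x^(p - 2) mod
 * p = 2^255 - 19 via the addition chain annotated above, so multiplying the
 * result back by x yields 1 for any x != 0:
 *
 *   fe51 x, xinv, check;
 *   fe51_invert(&xinv, &x);
 *   fe51_mul(&check, &xinv, &x);  // check now encodes 1
 */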

@ -1,16 +1,15 @@
#ifndef fe51_namespace_H
#define fe51_namespace_H
#define fe51 crypto_scalarmult_curve25519_sandy2x_fe51
#define _fe51 _crypto_scalarmult_curve25519_sandy2x_fe51
#define fe51_pack crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define _fe51_pack _crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define fe51_mul crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define _fe51_mul _crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define fe51_nsquare crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define _fe51_nsquare _crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define fe51 crypto_scalarmult_curve25519_sandy2x_fe51
#define _fe51 _crypto_scalarmult_curve25519_sandy2x_fe51
#define fe51_pack crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define _fe51_pack _crypto_scalarmult_curve25519_sandy2x_fe51_pack
#define fe51_mul crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define _fe51_mul _crypto_scalarmult_curve25519_sandy2x_fe51_mul
#define fe51_nsquare crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define _fe51_nsquare _crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
#define fe51_invert crypto_scalarmult_curve25519_sandy2x_fe51_invert
#define fe51_invert crypto_scalarmult_curve25519_sandy2x_fe51_invert
#endif /* ifndef fe51_namespace_H */

@ -10,9 +10,9 @@ static uint64_t
load_3(const unsigned char *in)
{
uint64_t result;
result = (uint64_t) in[0];
result |= ((uint64_t) in[1]) << 8;
result |= ((uint64_t) in[2]) << 16;
result = (uint64_t)in[0];
result |= ((uint64_t)in[1]) << 8;
result |= ((uint64_t)in[2]) << 16;
return result;
}
@ -20,10 +20,10 @@ static uint64_t
load_4(const unsigned char *in)
{
uint64_t result;
result = (uint64_t) in[0];
result |= ((uint64_t) in[1]) << 8;
result |= ((uint64_t) in[2]) << 16;
result |= ((uint64_t) in[3]) << 24;
result = (uint64_t)in[0];
result |= ((uint64_t)in[1]) << 8;
result |= ((uint64_t)in[2]) << 16;
result |= ((uint64_t)in[3]) << 24;
return result;
}
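
/*
 * Worked example (not part of the diff): both helpers assemble bytes in
 * little-endian order, so for in = {0x78, 0x56, 0x34, 0x12}, load_4(in)
 * returns 0x12345678 and load_3(in) returns 0x345678.
 */
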
@ -51,17 +51,37 @@ fe_frombytes(fe h, const unsigned char *s)
uint64_t carry8;
uint64_t carry9;
carry9 = h9 >> 25; h0 += carry9 * 19; h9 &= 0x1FFFFFF;
carry1 = h1 >> 25; h2 += carry1; h1 &= 0x1FFFFFF;
carry3 = h3 >> 25; h4 += carry3; h3 &= 0x1FFFFFF;
carry5 = h5 >> 25; h6 += carry5; h5 &= 0x1FFFFFF;
carry7 = h7 >> 25; h8 += carry7; h7 &= 0x1FFFFFF;
carry9 = h9 >> 25;
h0 += carry9 * 19;
h9 &= 0x1FFFFFF;
carry1 = h1 >> 25;
h2 += carry1;
h1 &= 0x1FFFFFF;
carry3 = h3 >> 25;
h4 += carry3;
h3 &= 0x1FFFFFF;
carry5 = h5 >> 25;
h6 += carry5;
h5 &= 0x1FFFFFF;
carry7 = h7 >> 25;
h8 += carry7;
h7 &= 0x1FFFFFF;
carry0 = h0 >> 26; h1 += carry0; h0 &= 0x3FFFFFF;
carry2 = h2 >> 26; h3 += carry2; h2 &= 0x3FFFFFF;
carry4 = h4 >> 26; h5 += carry4; h4 &= 0x3FFFFFF;
carry6 = h6 >> 26; h7 += carry6; h6 &= 0x3FFFFFF;
carry8 = h8 >> 26; h9 += carry8; h8 &= 0x3FFFFFF;
carry0 = h0 >> 26;
h1 += carry0;
h0 &= 0x3FFFFFF;
carry2 = h2 >> 26;
h3 += carry2;
h2 &= 0x3FFFFFF;
carry4 = h4 >> 26;
h5 += carry4;
h4 &= 0x3FFFFFF;
carry6 = h6 >> 26;
h7 += carry6;
h6 &= 0x3FFFFFF;
carry8 = h8 >> 26;
h9 += carry8;
h8 &= 0x3FFFFFF;
h[0] = h0;
h[1] = h1;

@ -2,17 +2,18 @@
#define ladder_H
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#include "fe.h"
#include "ladder_namespace.h"
extern void ladder(fe *, const unsigned char *);
extern void
ladder(fe *, const unsigned char *);
#ifdef __cplusplus
}
#endif
#endif /* ifndef ladder_H */

@ -2,17 +2,18 @@
#define ladder_base_H
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#include "fe.h"
#include "ladder_base_namespace.h"
extern void ladder_base(fe *, const unsigned char *);
extern void
ladder_base(fe *, const unsigned char *);
#ifdef __cplusplus
}
#endif
#endif /* ifndef ladder_base_H */

@ -1,8 +1,7 @@
#ifndef ladder_base_namespace_H
#define ladder_base_namespace_H
#define ladder_base crypto_scalarmult_curve25519_sandy2x_ladder_base
#define ladder_base crypto_scalarmult_curve25519_sandy2x_ladder_base
#define _ladder_base _crypto_scalarmult_curve25519_sandy2x_ladder_base
#endif /* ifndef ladder_base_namespace_H */

@ -1,8 +1,7 @@
#ifndef ladder_namespace_H
#define ladder_namespace_H
#define ladder crypto_scalarmult_curve25519_sandy2x_ladder
#define ladder crypto_scalarmult_curve25519_sandy2x_ladder
#define _ladder _crypto_scalarmult_curve25519_sandy2x_ladder
#endif /* ifndef ladder_namespace_H */

@ -2,10 +2,10 @@
#ifndef scalarmult_poly1305_H
#define scalarmult_poly1305_H
typedef struct crypto_scalarmult_curve25519_implementation {
int (*mult)(unsigned char *q, const unsigned char *n,
const unsigned char *p);
int (*mult_base)(unsigned char *q, const unsigned char *n);
typedef struct crypto_scalarmult_curve25519_implementation
{
int (*mult)(unsigned char *q, const unsigned char *n, const unsigned char *p);
int (*mult_base)(unsigned char *q, const unsigned char *n);
} crypto_scalarmult_curve25519_implementation;
#endif
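
/*
 * Illustrative sketch only: the struct above is a small vtable that lets the
 * generic curve25519 front end pick an implementation at runtime.  The
 * call-through helper below is an assumption for illustration, not code from
 * this repository.
 */
static int
scalarmult_via(const crypto_scalarmult_curve25519_implementation *impl,
               unsigned char *q, const unsigned char *n,
               const unsigned char *p)
{
  return impl->mult(q, n, p);
}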

@ -1,18 +1,17 @@
#ifndef sign_ed25519_ref10_H
#define sign_ed25519_ref10_H
void _crypto_sign_ed25519_ref10_hinit(crypto_hash_sha512_state *hs,
int prehashed);
void
_crypto_sign_ed25519_ref10_hinit(crypto_hash_sha512_state *hs, int prehashed);
int _crypto_sign_ed25519_detached(unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk, int prehashed);
int
_crypto_sign_ed25519_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk, int prehashed);
int _crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk,
int prehashed);
int
_crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk, int prehashed);
#endif

@ -1,12 +1,22 @@
int crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
int crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
int
crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k,
const unsigned char *pk);
int crypto_kem_keypair_ref(unsigned char *pk, unsigned char * sk);
int
crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk);
int crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
int crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
int
crypto_kem_keypair_ref(unsigned char *pk, unsigned char *sk);
int crypto_kem_keypair_avx2(unsigned char *pk, unsigned char * sk);
int
crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k,
const unsigned char *pk);
int
crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr,
const unsigned char *sk);
int
crypto_kem_keypair_avx2(unsigned char *pk, unsigned char *sk);
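
/*
 * Call sequence for the KEM prototypes above (sketch only; the key and
 * ciphertext buffer sizes come from constants defined elsewhere in this
 * tree):
 *
 *   crypto_kem_keypair_ref(pk, sk);
 *   crypto_kem_enc_ref(cstr, k1, pk);  // sender: ciphertext + shared key
 *   crypto_kem_dec_ref(k2, cstr, sk);  // receiver recovers the same key
 *   // k1 and k2 now hold the same shared secret
 */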

@ -5,42 +5,42 @@
#include <stdlib.h>
#include <string.h>
#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])
#ifdef HAVE_TI_MODE
# if defined(__SIZEOF_INT128__)
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
# else
#else
typedef unsigned uint128_t __attribute__((mode(TI)));
# endif
#endif
#endif
#define ROTL32(X, B) rotl32((X), (B))
static inline uint32_t
rotl32(const uint32_t x, const int b)
{
return (x << b) | (x >> (32 - b));
return (x << b) | (x >> (32 - b));
}
#define ROTL64(X, B) rotl64((X), (B))
static inline uint64_t
rotl64(const uint64_t x, const int b)
{
return (x << b) | (x >> (64 - b));
return (x << b) | (x >> (64 - b));
}
#define ROTR32(X, B) rotr32((X), (B))
static inline uint32_t
rotr32(const uint32_t x, const int b)
{
return (x >> b) | (x << (32 - b));
return (x >> b) | (x << (32 - b));
}
#define ROTR64(X, B) rotr64((X), (B))
static inline uint64_t
rotr64(const uint64_t x, const int b)
{
return (x >> b) | (x << (64 - b));
return (x >> b) | (x << (64 - b));
}
#define LOAD64_LE(SRC) load64_le(SRC)
@ -48,19 +48,19 @@ static inline uint64_t
load64_le(const uint8_t src[8])
{
#ifdef NATIVE_LITTLE_ENDIAN
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint64_t w = (uint64_t) src[0];
w |= (uint64_t) src[1] << 8;
w |= (uint64_t) src[2] << 16;
w |= (uint64_t) src[3] << 24;
w |= (uint64_t) src[4] << 32;
w |= (uint64_t) src[5] << 40;
w |= (uint64_t) src[6] << 48;
w |= (uint64_t) src[7] << 56;
return w;
uint64_t w = (uint64_t)src[0];
w |= (uint64_t)src[1] << 8;
w |= (uint64_t)src[2] << 16;
w |= (uint64_t)src[3] << 24;
w |= (uint64_t)src[4] << 32;
w |= (uint64_t)src[5] << 40;
w |= (uint64_t)src[6] << 48;
w |= (uint64_t)src[7] << 56;
return w;
#endif
}
@ -69,16 +69,23 @@ static inline void
store64_le(uint8_t dst[8], uint64_t w)
{
#ifdef NATIVE_LITTLE_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[7] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[7] = (uint8_t)w;
#endif
}
@ -87,15 +94,15 @@ static inline uint32_t
load32_le(const uint8_t src[4])
{
#ifdef NATIVE_LITTLE_ENDIAN
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint32_t w = (uint32_t) src[0];
w |= (uint32_t) src[1] << 8;
w |= (uint32_t) src[2] << 16;
w |= (uint32_t) src[3] << 24;
return w;
uint32_t w = (uint32_t)src[0];
w |= (uint32_t)src[1] << 8;
w |= (uint32_t)src[2] << 16;
w |= (uint32_t)src[3] << 24;
return w;
#endif
}
@ -104,12 +111,15 @@ static inline void
store32_le(uint8_t dst[4], uint32_t w)
{
#ifdef NATIVE_LITTLE_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
#endif
}
@ -120,19 +130,19 @@ static inline uint64_t
load64_be(const uint8_t src[8])
{
#ifdef NATIVE_BIG_ENDIAN
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint64_t w = (uint64_t) src[7];
w |= (uint64_t) src[6] << 8;
w |= (uint64_t) src[5] << 16;
w |= (uint64_t) src[4] << 24;
w |= (uint64_t) src[3] << 32;
w |= (uint64_t) src[2] << 40;
w |= (uint64_t) src[1] << 48;
w |= (uint64_t) src[0] << 56;
return w;
uint64_t w = (uint64_t)src[7];
w |= (uint64_t)src[6] << 8;
w |= (uint64_t)src[5] << 16;
w |= (uint64_t)src[4] << 24;
w |= (uint64_t)src[3] << 32;
w |= (uint64_t)src[2] << 40;
w |= (uint64_t)src[1] << 48;
w |= (uint64_t)src[0] << 56;
return w;
#endif
}
@ -141,16 +151,23 @@ static inline void
store64_be(uint8_t dst[8], uint64_t w)
{
#ifdef NATIVE_BIG_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[7] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[7] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
#endif
}
@ -159,15 +176,15 @@ static inline uint32_t
load32_be(const uint8_t src[4])
{
#ifdef NATIVE_BIG_ENDIAN
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint32_t w = (uint32_t) src[3];
w |= (uint32_t) src[2] << 8;
w |= (uint32_t) src[1] << 16;
w |= (uint32_t) src[0] << 24;
return w;
uint32_t w = (uint32_t)src[3];
w |= (uint32_t)src[2] << 8;
w |= (uint32_t)src[1] << 16;
w |= (uint32_t)src[0] << 24;
return w;
#endif
}
@ -176,12 +193,15 @@ static inline void
store32_be(uint8_t dst[4], uint32_t w)
{
#ifdef NATIVE_BIG_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
#endif
}
@ -189,58 +209,61 @@ store32_be(uint8_t dst[4], uint32_t w)
static inline void
xor_buf(unsigned char *out, const unsigned char *in, size_t n)
{
size_t i;
size_t i;
for (i = 0; i < n; i++) {
out[i] ^= in[i];
}
for(i = 0; i < n; i++)
{
out[i] ^= in[i];
}
}
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#endif
#define __attribute__(a)
#endif
#ifndef CRYPTO_ALIGN
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#endif
#if defined(_MSC_VER) && \
(defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
# include <intrin.h>
# define HAVE_INTRIN_H 1
# define HAVE_MMINTRIN_H 1
# define HAVE_EMMINTRIN_H 1
# define HAVE_PMMINTRIN_H 1
# define HAVE_TMMINTRIN_H 1
# define HAVE_SMMINTRIN_H 1
# define HAVE_AVXINTRIN_H 1
# if _MSC_VER >= 1600
# define HAVE_WMMINTRIN_H 1
# endif
# if _MSC_VER >= 1700 && defined(_M_X64)
# define HAVE_AVX2INTRIN_H 1
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#else
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#if defined(_MSC_VER) \
&& (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
#include <intrin.h>
#define HAVE_INTRIN_H 1
#define HAVE_MMINTRIN_H 1
#define HAVE_EMMINTRIN_H 1
#define HAVE_PMMINTRIN_H 1
#define HAVE_TMMINTRIN_H 1
#define HAVE_SMMINTRIN_H 1
#define HAVE_AVXINTRIN_H 1
#if _MSC_VER >= 1600
#define HAVE_WMMINTRIN_H 1
#endif
#if _MSC_VER >= 1700 && defined(_M_X64)
#define HAVE_AVX2INTRIN_H 1
#endif
#elif defined(HAVE_INTRIN_H)
# include <intrin.h>
#include <intrin.h>
#endif
#ifdef HAVE_LIBCTGRIND
extern void ct_poison (const void *, size_t);
extern void ct_unpoison(const void *, size_t);
# define POISON(X, L) ct_poison((X), (L))
# define UNPOISON(X, L) ct_unpoison((X), (L))
extern void
ct_poison(const void *, size_t);
extern void
ct_unpoison(const void *, size_t);
#define POISON(X, L) ct_poison((X), (L))
#define UNPOISON(X, L) ct_unpoison((X), (L))
#else
# define POISON(X, L) (void) 0
# define UNPOISON(X, L) (void) 0
#define POISON(X, L) (void)0
#define UNPOISON(X, L) (void)0
#endif
#endif
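
/*
 * Illustrative round-trip check for the helpers above (a sketch, assuming
 * <stdint.h> is in scope as elsewhere in this header): store64_le() and
 * load64_le() are inverses regardless of host endianness, and rotl32()
 * rotates bits, e.g. rotl32(0x80000001U, 1) == 0x3.
 */
static int
endian_helpers_selfcheck(void)
{
  uint8_t buf[8];
  uint64_t w = 0x0123456789abcdefULL;

  store64_le(buf, w);
  if(load64_le(buf) != w || buf[0] != 0xef)
  {
    return -1;
  }
  return rotl32(0x80000001U, 1) == 0x3 ? 0 : -1;
}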

@ -14,157 +14,189 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_box_SEEDBYTES crypto_box_curve25519xsalsa20poly1305_SEEDBYTES
SODIUM_EXPORT
size_t crypto_box_seedbytes(void);
#define crypto_box_PUBLICKEYBYTES crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES
SODIUM_EXPORT
size_t crypto_box_publickeybytes(void);
#define crypto_box_SECRETKEYBYTES crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES
SODIUM_EXPORT
size_t crypto_box_secretkeybytes(void);
SODIUM_EXPORT
size_t
crypto_box_seedbytes(void);
#define crypto_box_PUBLICKEYBYTES \
crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES
SODIUM_EXPORT
size_t
crypto_box_publickeybytes(void);
#define crypto_box_SECRETKEYBYTES \
crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES
SODIUM_EXPORT
size_t
crypto_box_secretkeybytes(void);
#define crypto_box_NONCEBYTES crypto_box_curve25519xsalsa20poly1305_NONCEBYTES
SODIUM_EXPORT
size_t crypto_box_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_box_noncebytes(void);
#define crypto_box_MACBYTES crypto_box_curve25519xsalsa20poly1305_MACBYTES
SODIUM_EXPORT
size_t crypto_box_macbytes(void);
SODIUM_EXPORT
size_t
crypto_box_macbytes(void);
#define crypto_box_MESSAGEBYTES_MAX crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX
SODIUM_EXPORT
size_t crypto_box_messagebytes_max(void);
#define crypto_box_MESSAGEBYTES_MAX \
crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX
SODIUM_EXPORT
size_t
crypto_box_messagebytes_max(void);
#define crypto_box_PRIMITIVE "curve25519xsalsa20poly1305"
SODIUM_EXPORT
const char *crypto_box_primitive(void);
SODIUM_EXPORT
int crypto_box_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int crypto_box_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int crypto_box_easy(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_open_easy(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_detached(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_open_detached(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
/* -- Precomputation interface -- */
#define crypto_box_BEFORENMBYTES crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES
SODIUM_EXPORT
size_t crypto_box_beforenmbytes(void);
SODIUM_EXPORT
int crypto_box_beforenm(unsigned char *k, const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_easy_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_box_open_easy_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_detached_afternm(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_box_open_detached_afternm(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
/* -- Ephemeral SK interface -- */
SODIUM_EXPORT
const char *
crypto_box_primitive(void);
SODIUM_EXPORT
int
crypto_box_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int
crypto_box_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int
crypto_box_easy(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_open_easy(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_detached(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_open_detached(unsigned char *m, const unsigned char *c,
const unsigned char *mac, unsigned long long clen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
/* -- Precomputation interface -- */
#define crypto_box_BEFORENMBYTES \
crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES
SODIUM_EXPORT
size_t
crypto_box_beforenmbytes(void);
SODIUM_EXPORT
int
crypto_box_beforenm(unsigned char *k, const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_easy_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_box_open_easy_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_detached_afternm(unsigned char *c, unsigned char *mac,
const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_box_open_detached_afternm(unsigned char *m, const unsigned char *c,
const unsigned char *mac,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
__attribute__((warn_unused_result));
/* -- Ephemeral SK interface -- */
#define crypto_box_SEALBYTES (crypto_box_PUBLICKEYBYTES + crypto_box_MACBYTES)
SODIUM_EXPORT
size_t crypto_box_sealbytes(void);
SODIUM_EXPORT
size_t
crypto_box_sealbytes(void);
SODIUM_EXPORT
int crypto_box_seal(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk);
SODIUM_EXPORT
int
crypto_box_seal(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk);
SODIUM_EXPORT
int crypto_box_seal_open(unsigned char *m, const unsigned char *c,
unsigned long long clen,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_seal_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
/* -- NaCl compatibility interface ; Requires padding -- */
/* -- NaCl compatibility interface ; Requires padding -- */
#define crypto_box_ZEROBYTES crypto_box_curve25519xsalsa20poly1305_ZEROBYTES
SODIUM_EXPORT
size_t crypto_box_zerobytes(void);
#define crypto_box_BOXZEROBYTES crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES
SODIUM_EXPORT
size_t crypto_box_boxzerobytes(void);
SODIUM_EXPORT
int crypto_box(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_box_open_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
size_t
crypto_box_zerobytes(void);
#define crypto_box_BOXZEROBYTES \
crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES
SODIUM_EXPORT
size_t
crypto_box_boxzerobytes(void);
SODIUM_EXPORT
int
crypto_box(unsigned char *c, const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk,
const unsigned char *sk) __attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_open(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_afternm(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_box_open_afternm(unsigned char *m, const unsigned char *c,
unsigned long long clen, const unsigned char *n,
const unsigned char *k)
__attribute__((warn_unused_result));
#ifdef __cplusplus
}
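
/*
 * Usage sketch for the crypto_box API declared above (illustrative only,
 * error handling trimmed; randombytes_buf() is assumed to be available from
 * the same library).  Alice encrypts to Bob with crypto_box_easy(), and Bob
 * decrypts with crypto_box_open_easy().
 */
static int
crypto_box_roundtrip_example(void)
{
  unsigned char alice_pk[crypto_box_PUBLICKEYBYTES];
  unsigned char alice_sk[crypto_box_SECRETKEYBYTES];
  unsigned char bob_pk[crypto_box_PUBLICKEYBYTES];
  unsigned char bob_sk[crypto_box_SECRETKEYBYTES];
  unsigned char nonce[crypto_box_NONCEBYTES];
  unsigned char msg[4] = {'p', 'i', 'n', 'g'};
  unsigned char ct[crypto_box_MACBYTES + sizeof msg];
  unsigned char out[sizeof msg];

  crypto_box_keypair(alice_pk, alice_sk);
  crypto_box_keypair(bob_pk, bob_sk);
  randombytes_buf(nonce, sizeof nonce);
  crypto_box_easy(ct, msg, sizeof msg, nonce, bob_pk, alice_sk);
  return crypto_box_open_easy(out, ct, sizeof ct, nonce, alice_pk, bob_sk);
}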

@ -6,101 +6,114 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_box_curve25519xsalsa20poly1305_SEEDBYTES 32U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_seedbytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_seedbytes(void);
#define crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES 32U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_publickeybytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_publickeybytes(void);
#define crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES 32U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_secretkeybytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_secretkeybytes(void);
#define crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES 32U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_beforenmbytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_beforenmbytes(void);
#define crypto_box_curve25519xsalsa20poly1305_NONCEBYTES 24U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_noncebytes(void);
#define crypto_box_curve25519xsalsa20poly1305_MACBYTES 16U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_macbytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_macbytes(void);
/* Only for the libsodium API - The NaCl compatibility API would require BOXZEROBYTES extra bytes */
/* Only for the libsodium API - The NaCl compatibility API would require
* BOXZEROBYTES extra bytes */
#define crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX \
(crypto_stream_xsalsa20_MESSAGEBYTES_MAX - crypto_box_curve25519xsalsa20poly1305_MACBYTES)
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_messagebytes_max(void);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_seed_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_keypair(unsigned char *pk,
unsigned char *sk);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_beforenm(unsigned char *k,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
/* -- NaCl compatibility interface ; Requires padding -- */
(crypto_stream_xsalsa20_MESSAGEBYTES_MAX \
- crypto_box_curve25519xsalsa20poly1305_MACBYTES)
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_seed_keypair(unsigned char *pk,
unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_keypair(unsigned char *pk,
unsigned char *sk);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_beforenm(unsigned char *k,
const unsigned char *pk,
const unsigned char *sk)
__attribute__((warn_unused_result));
/* -- NaCl compatibility interface ; Requires padding -- */
#define crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES 16U
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_boxzerobytes(void);
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_boxzerobytes(void);
#define crypto_box_curve25519xsalsa20poly1305_ZEROBYTES \
(crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES + \
crypto_box_curve25519xsalsa20poly1305_MACBYTES)
SODIUM_EXPORT
size_t crypto_box_curve25519xsalsa20poly1305_zerobytes(void);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_open(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *pk,
const unsigned char *sk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_afternm(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_box_curve25519xsalsa20poly1305_open_afternm(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
__attribute__ ((warn_unused_result));
(crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES \
+ crypto_box_curve25519xsalsa20poly1305_MACBYTES)
SODIUM_EXPORT
size_t
crypto_box_curve25519xsalsa20poly1305_zerobytes(void);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305(
unsigned char *c, const unsigned char *m, unsigned long long mlen,
const unsigned char *n, const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_open(
unsigned char *m, const unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *pk, const unsigned char *sk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_afternm(unsigned char *c,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_box_curve25519xsalsa20poly1305_open_afternm(unsigned char *m,
const unsigned char *c,
unsigned long long clen,
const unsigned char *n,
const unsigned char *k)
__attribute__((warn_unused_result));
#ifdef __cplusplus
}

@ -5,28 +5,34 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_core_hchacha20_OUTPUTBYTES 32U
SODIUM_EXPORT
size_t crypto_core_hchacha20_outputbytes(void);
SODIUM_EXPORT
size_t
crypto_core_hchacha20_outputbytes(void);
#define crypto_core_hchacha20_INPUTBYTES 16U
SODIUM_EXPORT
size_t crypto_core_hchacha20_inputbytes(void);
SODIUM_EXPORT
size_t
crypto_core_hchacha20_inputbytes(void);
#define crypto_core_hchacha20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_core_hchacha20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_core_hchacha20_keybytes(void);
#define crypto_core_hchacha20_CONSTBYTES 16U
SODIUM_EXPORT
size_t crypto_core_hchacha20_constbytes(void);
SODIUM_EXPORT
int crypto_core_hchacha20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
SODIUM_EXPORT
size_t
crypto_core_hchacha20_constbytes(void);
SODIUM_EXPORT
int
crypto_core_hchacha20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
#ifdef __cplusplus
}
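
/*
 * Usage sketch for crypto_core_hchacha20() declared above (illustrative
 * only): derive a subkey from a 32-byte key and the first 16 bytes of a
 * 24-byte extended nonce, passing NULL to select the standard constants.
 * This is the first step of the XChaCha20 construction.
 */
static void
hchacha20_subkey_example(
    unsigned char subkey[crypto_core_hchacha20_OUTPUTBYTES],
    const unsigned char key[crypto_core_hchacha20_KEYBYTES],
    const unsigned char nonce24[24])
{
  crypto_core_hchacha20(subkey, nonce24, key, NULL);
}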

@ -5,28 +5,34 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_core_salsa20_OUTPUTBYTES 64U
SODIUM_EXPORT
size_t crypto_core_salsa20_outputbytes(void);
SODIUM_EXPORT
size_t
crypto_core_salsa20_outputbytes(void);
#define crypto_core_salsa20_INPUTBYTES 16U
SODIUM_EXPORT
size_t crypto_core_salsa20_inputbytes(void);
SODIUM_EXPORT
size_t
crypto_core_salsa20_inputbytes(void);
#define crypto_core_salsa20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_core_salsa20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_core_salsa20_keybytes(void);
#define crypto_core_salsa20_CONSTBYTES 16U
SODIUM_EXPORT
size_t crypto_core_salsa20_constbytes(void);
SODIUM_EXPORT
int crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
SODIUM_EXPORT
size_t
crypto_core_salsa20_constbytes(void);
SODIUM_EXPORT
int
crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c);
#ifdef __cplusplus
}

@ -7,66 +7,79 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_generichash_BYTES_MIN crypto_generichash_blake2b_BYTES_MIN
SODIUM_EXPORT
size_t crypto_generichash_bytes_min(void);
SODIUM_EXPORT
size_t
crypto_generichash_bytes_min(void);
#define crypto_generichash_BYTES_MAX crypto_generichash_blake2b_BYTES_MAX
SODIUM_EXPORT
size_t crypto_generichash_bytes_max(void);
SODIUM_EXPORT
size_t
crypto_generichash_bytes_max(void);
#define crypto_generichash_BYTES crypto_generichash_blake2b_BYTES
SODIUM_EXPORT
size_t crypto_generichash_bytes(void);
SODIUM_EXPORT
size_t
crypto_generichash_bytes(void);
#define crypto_generichash_KEYBYTES_MIN crypto_generichash_blake2b_KEYBYTES_MIN
SODIUM_EXPORT
size_t crypto_generichash_keybytes_min(void);
SODIUM_EXPORT
size_t
crypto_generichash_keybytes_min(void);
#define crypto_generichash_KEYBYTES_MAX crypto_generichash_blake2b_KEYBYTES_MAX
SODIUM_EXPORT
size_t crypto_generichash_keybytes_max(void);
SODIUM_EXPORT
size_t
crypto_generichash_keybytes_max(void);
#define crypto_generichash_KEYBYTES crypto_generichash_blake2b_KEYBYTES
SODIUM_EXPORT
size_t crypto_generichash_keybytes(void);
SODIUM_EXPORT
size_t
crypto_generichash_keybytes(void);
#define crypto_generichash_PRIMITIVE "blake2b"
SODIUM_EXPORT
const char *crypto_generichash_primitive(void);
typedef crypto_generichash_blake2b_state crypto_generichash_state;
SODIUM_EXPORT
size_t crypto_generichash_statebytes(void);
SODIUM_EXPORT
int crypto_generichash(unsigned char *out, size_t outlen,
const unsigned char *in, unsigned long long inlen,
const unsigned char *key, size_t keylen);
SODIUM_EXPORT
int crypto_generichash_init(crypto_generichash_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen);
SODIUM_EXPORT
int crypto_generichash_update(crypto_generichash_state *state,
const unsigned char *in,
unsigned long long inlen);
SODIUM_EXPORT
int crypto_generichash_final(crypto_generichash_state *state,
unsigned char *out, const size_t outlen);
SODIUM_EXPORT
void crypto_generichash_keygen(unsigned char k[crypto_generichash_KEYBYTES]);
SODIUM_EXPORT
const char *
crypto_generichash_primitive(void);
typedef crypto_generichash_blake2b_state crypto_generichash_state;
SODIUM_EXPORT
size_t
crypto_generichash_statebytes(void);
SODIUM_EXPORT
int
crypto_generichash(unsigned char *out, size_t outlen, const unsigned char *in,
unsigned long long inlen, const unsigned char *key,
size_t keylen);
SODIUM_EXPORT
int
crypto_generichash_init(crypto_generichash_state *state,
const unsigned char *key, const size_t keylen,
const size_t outlen);
SODIUM_EXPORT
int
crypto_generichash_update(crypto_generichash_state *state,
const unsigned char *in, unsigned long long inlen);
SODIUM_EXPORT
int
crypto_generichash_final(crypto_generichash_state *state, unsigned char *out,
const size_t outlen);
SODIUM_EXPORT
void
crypto_generichash_keygen(unsigned char k[crypto_generichash_KEYBYTES]);
#ifdef __cplusplus
}
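
/*
 * Streaming usage sketch for the crypto_generichash API declared above
 * (illustrative only): hash two chunks into a crypto_generichash_BYTES
 * digest without a key.
 */
static void
generichash_streaming_example(const unsigned char *chunk1, size_t len1,
                              const unsigned char *chunk2, size_t len2,
                              unsigned char out[crypto_generichash_BYTES])
{
  crypto_generichash_state st;

  crypto_generichash_init(&st, NULL, 0, crypto_generichash_BYTES);
  crypto_generichash_update(&st, chunk1, len1);
  crypto_generichash_update(&st, chunk2, len2);
  crypto_generichash_final(&st, out, crypto_generichash_BYTES);
}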

@ -8,107 +8,120 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
# pragma pack(1)
#pragma pack(1)
#else
# pragma pack(push, 1)
#pragma pack(push, 1)
#endif
typedef struct CRYPTO_ALIGN(64) crypto_generichash_blake2b_state {
typedef struct CRYPTO_ALIGN(64) crypto_generichash_blake2b_state
{
uint64_t h[8];
uint64_t t[2];
uint64_t f[2];
uint8_t buf[2 * 128];
size_t buflen;
uint8_t last_node;
} crypto_generichash_blake2b_state;
uint8_t buf[2 * 128];
size_t buflen;
uint8_t last_node;
} crypto_generichash_blake2b_state;
#if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
# pragma pack()
#pragma pack()
#else
# pragma pack(pop)
#pragma pack(pop)
#endif
#define crypto_generichash_blake2b_BYTES_MIN 16U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_bytes_min(void);
#define crypto_generichash_blake2b_BYTES_MAX 64U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_bytes_max(void);
#define crypto_generichash_blake2b_BYTES 32U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_bytes(void);
#define crypto_generichash_blake2b_KEYBYTES_MIN 16U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_keybytes_min(void);
#define crypto_generichash_blake2b_KEYBYTES_MAX 64U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_keybytes_max(void);
#define crypto_generichash_blake2b_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_keybytes(void);
#define crypto_generichash_blake2b_SALTBYTES 16U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_saltbytes(void);
#define crypto_generichash_blake2b_BYTES_MIN 16U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_bytes_min(void);
#define crypto_generichash_blake2b_BYTES_MAX 64U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_bytes_max(void);
#define crypto_generichash_blake2b_BYTES 32U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_bytes(void);
#define crypto_generichash_blake2b_KEYBYTES_MIN 16U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_keybytes_min(void);
#define crypto_generichash_blake2b_KEYBYTES_MAX 64U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_keybytes_max(void);
#define crypto_generichash_blake2b_KEYBYTES 32U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_keybytes(void);
#define crypto_generichash_blake2b_SALTBYTES 16U
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_saltbytes(void);
#define crypto_generichash_blake2b_PERSONALBYTES 16U
SODIUM_EXPORT
size_t crypto_generichash_blake2b_personalbytes(void);
SODIUM_EXPORT
size_t crypto_generichash_blake2b_statebytes(void);
SODIUM_EXPORT
int crypto_generichash_blake2b(unsigned char *out, size_t outlen,
const unsigned char *in,
unsigned long long inlen,
const unsigned char *key, size_t keylen);
SODIUM_EXPORT
int crypto_generichash_blake2b_salt_personal(unsigned char *out, size_t outlen,
const unsigned char *in,
unsigned long long inlen,
const unsigned char *key,
size_t keylen,
const unsigned char *salt,
const unsigned char *personal);
SODIUM_EXPORT
int crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen);
SODIUM_EXPORT
int crypto_generichash_blake2b_init_salt_personal(crypto_generichash_blake2b_state *state,
const unsigned char *key,
const size_t keylen, const size_t outlen,
const unsigned char *salt,
const unsigned char *personal);
SODIUM_EXPORT
int crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state,
const unsigned char *in,
unsigned long long inlen);
SODIUM_EXPORT
int crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state,
unsigned char *out,
const size_t outlen);
SODIUM_EXPORT
void crypto_generichash_blake2b_keygen(unsigned char k[crypto_generichash_blake2b_KEYBYTES]);
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_personalbytes(void);
SODIUM_EXPORT
size_t
crypto_generichash_blake2b_statebytes(void);
SODIUM_EXPORT
int
crypto_generichash_blake2b(unsigned char *out, size_t outlen,
const unsigned char *in, unsigned long long inlen,
const unsigned char *key, size_t keylen);
SODIUM_EXPORT
int
crypto_generichash_blake2b_salt_personal(
unsigned char *out, size_t outlen, const unsigned char *in,
unsigned long long inlen, const unsigned char *key, size_t keylen,
const unsigned char *salt, const unsigned char *personal);
SODIUM_EXPORT
int
crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state,
const unsigned char *key, const size_t keylen,
const size_t outlen);
SODIUM_EXPORT
int
crypto_generichash_blake2b_init_salt_personal(
crypto_generichash_blake2b_state *state, const unsigned char *key,
const size_t keylen, const size_t outlen, const unsigned char *salt,
const unsigned char *personal);
SODIUM_EXPORT
int
crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state,
const unsigned char *in,
unsigned long long inlen);
SODIUM_EXPORT
int
crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state,
unsigned char *out, const size_t outlen);
SODIUM_EXPORT
void
crypto_generichash_blake2b_keygen(
unsigned char k[crypto_generichash_blake2b_KEYBYTES]);
#ifdef __cplusplus
}

@ -7,36 +7,41 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_scalarmult_BYTES crypto_scalarmult_curve25519_BYTES
SODIUM_EXPORT
size_t crypto_scalarmult_bytes(void);
SODIUM_EXPORT
size_t
crypto_scalarmult_bytes(void);
#define crypto_scalarmult_SCALARBYTES crypto_scalarmult_curve25519_SCALARBYTES
SODIUM_EXPORT
size_t crypto_scalarmult_scalarbytes(void);
SODIUM_EXPORT
size_t
crypto_scalarmult_scalarbytes(void);
#define crypto_scalarmult_PRIMITIVE "curve25519"
SODIUM_EXPORT
const char *crypto_scalarmult_primitive(void);
SODIUM_EXPORT
int crypto_scalarmult_base(unsigned char *q, const unsigned char *n);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
 * Or, if this is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int crypto_scalarmult(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
const char *
crypto_scalarmult_primitive(void);
SODIUM_EXPORT
int
crypto_scalarmult_base(unsigned char *q, const unsigned char *n);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
 * Or, if this is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int
crypto_scalarmult(unsigned char *q, const unsigned char *n,
const unsigned char *p) __attribute__((warn_unused_result));
#ifdef __cplusplus
}
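
/*
 * Sketch of the key derivation recommended in the NOTE above (illustrative
 * only): never use q directly, hash q || client_pk || server_pk into the
 * shared key instead.  The crypto_generichash API from this same tree is
 * assumed to be available here.
 */
static int
derive_shared_key_example(unsigned char key[crypto_generichash_BYTES],
                          const unsigned char *my_sk,
                          const unsigned char *their_pk,
                          const unsigned char *client_pk,
                          const unsigned char *server_pk)
{
  unsigned char q[crypto_scalarmult_BYTES];
  crypto_generichash_state st;

  if(crypto_scalarmult(q, my_sk, their_pk) != 0)
  {
    return -1; /* degenerate public key */
  }
  crypto_generichash_init(&st, NULL, 0, crypto_generichash_BYTES);
  crypto_generichash_update(&st, q, sizeof q);
  crypto_generichash_update(&st, client_pk, crypto_scalarmult_BYTES);
  crypto_generichash_update(&st, server_pk, crypto_scalarmult_BYTES);
  crypto_generichash_final(&st, key, crypto_generichash_BYTES);
  return 0;
}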

@ -6,32 +6,37 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_scalarmult_curve25519_BYTES 32U
SODIUM_EXPORT
size_t crypto_scalarmult_curve25519_bytes(void);
SODIUM_EXPORT
size_t
crypto_scalarmult_curve25519_bytes(void);
#define crypto_scalarmult_curve25519_SCALARBYTES 32U
SODIUM_EXPORT
size_t crypto_scalarmult_curve25519_scalarbytes(void);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
 * Or, if this is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int crypto_scalarmult_curve25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_scalarmult_curve25519_base(unsigned char *q, const unsigned char *n);
SODIUM_EXPORT
size_t
crypto_scalarmult_curve25519_scalarbytes(void);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
 * Or, if this is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int
crypto_scalarmult_curve25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_scalarmult_curve25519_base(unsigned char *q, const unsigned char *n);
#ifdef __cplusplus
}

@ -7,32 +7,37 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#define crypto_scalarmult_ed25519_BYTES 32U
SODIUM_EXPORT
size_t crypto_scalarmult_ed25519_bytes(void);
SODIUM_EXPORT
size_t
crypto_scalarmult_ed25519_bytes(void);
#define crypto_scalarmult_ed25519_SCALARBYTES 32U
SODIUM_EXPORT
size_t crypto_scalarmult_ed25519_scalarbytes(void);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
 * Or, if this is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int crypto_scalarmult_ed25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_scalarmult_ed25519_base(unsigned char *q, const unsigned char *n);
SODIUM_EXPORT
size_t
crypto_scalarmult_ed25519_scalarbytes(void);
/*
* NOTE: Do not use the result of this function directly.
*
* Hash the result with the public keys in order to compute a shared
* secret key: H(q || client_pk || server_pk)
*
 * Or, if this is not an option, use the crypto_kx() API instead.
*/
SODIUM_EXPORT
int
crypto_scalarmult_ed25519(unsigned char *q, const unsigned char *n,
const unsigned char *p)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_scalarmult_ed25519_base(unsigned char *q, const unsigned char *n);
#ifdef __cplusplus
}

@ -14,87 +14,102 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
typedef crypto_sign_ed25519ph_state crypto_sign_state;
typedef crypto_sign_ed25519ph_state crypto_sign_state;
SODIUM_EXPORT
size_t crypto_sign_statebytes(void);
SODIUM_EXPORT
size_t
crypto_sign_statebytes(void);
#define crypto_sign_BYTES crypto_sign_ed25519_BYTES
SODIUM_EXPORT
size_t crypto_sign_bytes(void);
SODIUM_EXPORT
size_t
crypto_sign_bytes(void);
#define crypto_sign_SEEDBYTES crypto_sign_ed25519_SEEDBYTES
SODIUM_EXPORT
size_t crypto_sign_seedbytes(void);
SODIUM_EXPORT
size_t
crypto_sign_seedbytes(void);
#define crypto_sign_PUBLICKEYBYTES crypto_sign_ed25519_PUBLICKEYBYTES
SODIUM_EXPORT
size_t crypto_sign_publickeybytes(void);
SODIUM_EXPORT
size_t
crypto_sign_publickeybytes(void);
#define crypto_sign_SECRETKEYBYTES crypto_sign_ed25519_SECRETKEYBYTES
SODIUM_EXPORT
size_t crypto_sign_secretkeybytes(void);
SODIUM_EXPORT
size_t
crypto_sign_secretkeybytes(void);
#define crypto_sign_MESSAGEBYTES_MAX crypto_sign_ed25519_MESSAGEBYTES_MAX
SODIUM_EXPORT
size_t crypto_sign_messagebytes_max(void);
SODIUM_EXPORT
size_t
crypto_sign_messagebytes_max(void);
#define crypto_sign_PRIMITIVE "ed25519"
SODIUM_EXPORT
const char *crypto_sign_primitive(void);
SODIUM_EXPORT
int crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int crypto_sign(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_init(crypto_sign_state *state);
SODIUM_EXPORT
int crypto_sign_update(crypto_sign_state *state,
const unsigned char *m, unsigned long long mlen);
SODIUM_EXPORT
int crypto_sign_final_create(crypto_sign_state *state, unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_final_verify(crypto_sign_state *state, unsigned char *sig,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
const char *
crypto_sign_primitive(void);
SODIUM_EXPORT
int
crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int
crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk) __attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_verify_detached(const unsigned char *sig, const unsigned char *m,
unsigned long long mlen, const unsigned char *pk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_init(crypto_sign_state *state);
SODIUM_EXPORT
int
crypto_sign_update(crypto_sign_state *state, const unsigned char *m,
unsigned long long mlen);
SODIUM_EXPORT
int
crypto_sign_final_create(crypto_sign_state *state, unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_final_verify(crypto_sign_state *state, unsigned char *sig,
const unsigned char *pk)
__attribute__((warn_unused_result));
#ifdef __cplusplus
}
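
/*
 * Usage sketch for the detached signing API declared above (illustrative
 * only): sign a message and verify it with the matching public key; both
 * crypto_sign_detached() and crypto_sign_verify_detached() return 0 on
 * success.
 */
static int
sign_detached_example(const unsigned char *m, unsigned long long mlen)
{
  unsigned char pk[crypto_sign_PUBLICKEYBYTES];
  unsigned char sk[crypto_sign_SECRETKEYBYTES];
  unsigned char sig[crypto_sign_BYTES];
  unsigned long long siglen;

  crypto_sign_keypair(pk, sk);
  crypto_sign_detached(sig, &siglen, m, mlen, sk);
  return crypto_sign_verify_detached(sig, m, mlen, pk);
}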

@ -6,106 +6,125 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
typedef struct crypto_sign_ed25519ph_state {
typedef struct crypto_sign_ed25519ph_state
{
crypto_hash_sha512_state hs;
} crypto_sign_ed25519ph_state;
} crypto_sign_ed25519ph_state;
SODIUM_EXPORT
size_t crypto_sign_ed25519ph_statebytes(void);
SODIUM_EXPORT
size_t
crypto_sign_ed25519ph_statebytes(void);
#define crypto_sign_ed25519_BYTES 64U
SODIUM_EXPORT
size_t crypto_sign_ed25519_bytes(void);
SODIUM_EXPORT
size_t
crypto_sign_ed25519_bytes(void);
#define crypto_sign_ed25519_SEEDBYTES 32U
SODIUM_EXPORT
size_t crypto_sign_ed25519_seedbytes(void);
SODIUM_EXPORT
size_t
crypto_sign_ed25519_seedbytes(void);
#define crypto_sign_ed25519_PUBLICKEYBYTES 32U
SODIUM_EXPORT
size_t crypto_sign_ed25519_publickeybytes(void);
SODIUM_EXPORT
size_t
crypto_sign_ed25519_publickeybytes(void);
#define crypto_sign_ed25519_SECRETKEYBYTES (32U + 32U)
SODIUM_EXPORT
size_t crypto_sign_ed25519_secretkeybytes(void);
#define crypto_sign_ed25519_MESSAGEBYTES_MAX (SODIUM_SIZE_MAX - crypto_sign_ed25519_BYTES)
SODIUM_EXPORT
size_t crypto_sign_ed25519_messagebytes_max(void);
SODIUM_EXPORT
int crypto_sign_ed25519(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_ed25519_detached(unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_ed25519_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int crypto_sign_ed25519_pk_to_curve25519(unsigned char *curve25519_pk,
const unsigned char *ed25519_pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int crypto_sign_ed25519_sk_to_curve25519(unsigned char *curve25519_sk,
const unsigned char *ed25519_sk);
SODIUM_EXPORT
int crypto_sign_ed25519_sk_to_seed(unsigned char *seed,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519_sk_to_pk(unsigned char *pk, const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519ph_init(crypto_sign_ed25519ph_state *state);
SODIUM_EXPORT
int crypto_sign_ed25519ph_update(crypto_sign_ed25519ph_state *state,
const unsigned char *m,
unsigned long long mlen);
SODIUM_EXPORT
int crypto_sign_ed25519ph_final_create(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
SODIUM_EXPORT
int crypto_sign_ed25519ph_final_verify(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
const unsigned char *pk)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
size_t
crypto_sign_ed25519_secretkeybytes(void);
#define crypto_sign_ed25519_MESSAGEBYTES_MAX \
(SODIUM_SIZE_MAX - crypto_sign_ed25519_BYTES)
SODIUM_EXPORT
size_t
crypto_sign_ed25519_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_sign_ed25519(unsigned char *sm, unsigned long long *smlen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_open(unsigned char *m, unsigned long long *mlen_p,
const unsigned char *sm, unsigned long long smlen,
const unsigned char *pk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_ed25519_detached(unsigned char *sig, unsigned long long *siglen_p,
const unsigned char *m, unsigned long long mlen,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_verify_detached(const unsigned char *sig,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *pk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_ed25519_keypair(unsigned char *pk, unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_seed_keypair(unsigned char *pk, unsigned char *sk,
const unsigned char *seed);
SODIUM_EXPORT
int
crypto_sign_ed25519_pk_to_curve25519(unsigned char *curve25519_pk,
const unsigned char *ed25519_pk)
__attribute__((warn_unused_result));
SODIUM_EXPORT
int
crypto_sign_ed25519_sk_to_curve25519(unsigned char *curve25519_sk,
const unsigned char *ed25519_sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_sk_to_seed(unsigned char *seed, const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519_sk_to_pk(unsigned char *pk, const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519ph_init(crypto_sign_ed25519ph_state *state);
SODIUM_EXPORT
int
crypto_sign_ed25519ph_update(crypto_sign_ed25519ph_state *state,
const unsigned char *m, unsigned long long mlen);
SODIUM_EXPORT
int
crypto_sign_ed25519ph_final_create(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
unsigned long long *siglen_p,
const unsigned char *sk);
SODIUM_EXPORT
int
crypto_sign_ed25519ph_final_verify(crypto_sign_ed25519ph_state *state,
unsigned char *sig,
const unsigned char *pk)
__attribute__((warn_unused_result));
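/* Caller-side sketch of the prehashed (ph) flow declared above -- added for
 * illustration, not part of this diff. It assumes sodium_init() has already
 * succeeded; the state is initialised once, fed message chunks, then
 * finalised into a detached signature. Function and parameter names are
 * hypothetical. */
static int
sign_streamed_sketch(unsigned char sig[crypto_sign_ed25519_BYTES],
                     const unsigned char *chunk1, size_t len1,
                     const unsigned char *chunk2, size_t len2,
                     const unsigned char sk[crypto_sign_ed25519_SECRETKEYBYTES])
{
    crypto_sign_ed25519ph_state st;
    unsigned long long          siglen;

    crypto_sign_ed25519ph_init(&st);
    crypto_sign_ed25519ph_update(&st, chunk1, (unsigned long long) len1);
    crypto_sign_ed25519ph_update(&st, chunk2, (unsigned long long) len2);
    return crypto_sign_ed25519ph_final_create(&st, sig, &siglen, sk);
}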
#ifdef __cplusplus
}

@ -16,37 +16,42 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_sign_edwards25519sha512batch_BYTES 64U
#define crypto_sign_edwards25519sha512batch_PUBLICKEYBYTES 32U
#define crypto_sign_edwards25519sha512batch_SECRETKEYBYTES (32U + 32U)
#define crypto_sign_edwards25519sha512batch_MESSAGEBYTES_MAX (SODIUM_SIZE_MAX - crypto_sign_edwards25519sha512batch_BYTES)
SODIUM_EXPORT
int crypto_sign_edwards25519sha512batch(unsigned char *sm,
unsigned long long *smlen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk)
__attribute__ ((deprecated));
SODIUM_EXPORT
int crypto_sign_edwards25519sha512batch_open(unsigned char *m,
unsigned long long *mlen_p,
const unsigned char *sm,
unsigned long long smlen,
const unsigned char *pk)
__attribute__ ((deprecated));
SODIUM_EXPORT
int crypto_sign_edwards25519sha512batch_keypair(unsigned char *pk,
unsigned char *sk)
__attribute__ ((deprecated));
#define crypto_sign_edwards25519sha512batch_MESSAGEBYTES_MAX \
(SODIUM_SIZE_MAX - crypto_sign_edwards25519sha512batch_BYTES)
SODIUM_EXPORT
int
crypto_sign_edwards25519sha512batch(unsigned char *sm,
unsigned long long *smlen_p,
const unsigned char *m,
unsigned long long mlen,
const unsigned char *sk)
__attribute__((deprecated));
SODIUM_EXPORT
int
crypto_sign_edwards25519sha512batch_open(unsigned char *m,
unsigned long long *mlen_p,
const unsigned char *sm,
unsigned long long smlen,
const unsigned char *pk)
__attribute__((deprecated));
SODIUM_EXPORT
int
crypto_sign_edwards25519sha512batch_keypair(unsigned char *pk,
unsigned char *sk)
__attribute__((deprecated));
#ifdef __cplusplus
}

@ -14,82 +14,103 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_stream_chacha20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_stream_chacha20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_keybytes(void);
#define crypto_stream_chacha20_NONCEBYTES 8U
SODIUM_EXPORT
size_t crypto_stream_chacha20_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_noncebytes(void);
#define crypto_stream_chacha20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
SODIUM_EXPORT
size_t crypto_stream_chacha20_messagebytes_max(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_messagebytes_max(void);
/* ChaCha20 with a 64-bit nonce and a 64-bit counter, as originally designed */
/* ChaCha20 with a 64-bit nonce and a 64-bit counter, as originally designed
*/
SODIUM_EXPORT
int crypto_stream_chacha20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_chacha20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
SODIUM_EXPORT
void crypto_stream_chacha20_keygen(unsigned char k[crypto_stream_chacha20_KEYBYTES]);
SODIUM_EXPORT
void
crypto_stream_chacha20_keygen(
unsigned char k[crypto_stream_chacha20_KEYBYTES]);
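/* Illustrative sketch (not part of this diff): generate a key and XOR a
 * buffer in place with the original 64-bit-nonce variant declared above.
 * The caller is assumed never to reuse a (key, nonce) pair across messages. */
static void
chacha20_xor_inplace_sketch(unsigned char *buf, unsigned long long len,
                            const unsigned char n[crypto_stream_chacha20_NONCEBYTES])
{
    unsigned char key[crypto_stream_chacha20_KEYBYTES];

    crypto_stream_chacha20_keygen(key);
    /* ciphertext = plaintext XOR keystream; running it again with the same
     * key and nonce recovers the plaintext */
    crypto_stream_chacha20_xor(buf, buf, len, n, key);
}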
/* ChaCha20 with a 96-bit nonce and a 32-bit counter (IETF) */
/* ChaCha20 with a 96-bit nonce and a 32-bit counter (IETF) */
#define crypto_stream_chacha20_ietf_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_stream_chacha20_ietf_keybytes(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_ietf_keybytes(void);
#define crypto_stream_chacha20_ietf_NONCEBYTES 12U
SODIUM_EXPORT
size_t crypto_stream_chacha20_ietf_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_stream_chacha20_ietf_noncebytes(void);
#define crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX \
SODIUM_MIN(SODIUM_SIZE_MAX, 64ULL * (1ULL << 32))
SODIUM_EXPORT
size_t crypto_stream_chacha20_ietf_messagebytes_max(void);
SODIUM_EXPORT
int crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
SODIUM_EXPORT
void crypto_stream_chacha20_ietf_keygen(unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]);
/* Aliases */
SODIUM_MIN(SODIUM_SIZE_MAX, 64ULL * (1ULL << 32))
SODIUM_EXPORT
size_t
crypto_stream_chacha20_ietf_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n,
const unsigned char *k);
#define crypto_stream_chacha20_IETF_KEYBYTES crypto_stream_chacha20_ietf_KEYBYTES
#define crypto_stream_chacha20_IETF_NONCEBYTES crypto_stream_chacha20_ietf_NONCEBYTES
#define crypto_stream_chacha20_IETF_MESSAGEBYTES_MAX crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX
SODIUM_EXPORT
int
crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint32_t ic,
const unsigned char *k);
SODIUM_EXPORT
void
crypto_stream_chacha20_ietf_keygen(
unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]);
/* Aliases */
#define crypto_stream_chacha20_IETF_KEYBYTES \
crypto_stream_chacha20_ietf_KEYBYTES
#define crypto_stream_chacha20_IETF_NONCEBYTES \
crypto_stream_chacha20_ietf_NONCEBYTES
#define crypto_stream_chacha20_IETF_MESSAGEBYTES_MAX \
crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX
#ifdef __cplusplus
}

@ -14,41 +14,48 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_stream_salsa20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_stream_salsa20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_stream_salsa20_keybytes(void);
#define crypto_stream_salsa20_NONCEBYTES 8U
SODIUM_EXPORT
size_t crypto_stream_salsa20_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_stream_salsa20_noncebytes(void);
#define crypto_stream_salsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
SODIUM_EXPORT
size_t crypto_stream_salsa20_messagebytes_max(void);
SODIUM_EXPORT
int crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
SODIUM_EXPORT
void crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);
SODIUM_EXPORT
size_t
crypto_stream_salsa20_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
SODIUM_EXPORT
void
crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);
#ifdef __cplusplus
}

@ -14,41 +14,49 @@
#include "export.h"
#ifdef __cplusplus
# ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wlong-long"
# endif
extern "C" {
#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wlong-long"
#endif
extern "C"
{
#endif
#define crypto_stream_xsalsa20_KEYBYTES 32U
SODIUM_EXPORT
size_t crypto_stream_xsalsa20_keybytes(void);
SODIUM_EXPORT
size_t
crypto_stream_xsalsa20_keybytes(void);
#define crypto_stream_xsalsa20_NONCEBYTES 24U
SODIUM_EXPORT
size_t crypto_stream_xsalsa20_noncebytes(void);
SODIUM_EXPORT
size_t
crypto_stream_xsalsa20_noncebytes(void);
#define crypto_stream_xsalsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
SODIUM_EXPORT
size_t crypto_stream_xsalsa20_messagebytes_max(void);
SODIUM_EXPORT
int crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
SODIUM_EXPORT
void crypto_stream_xsalsa20_keygen(unsigned char k[crypto_stream_xsalsa20_KEYBYTES]);
SODIUM_EXPORT
size_t
crypto_stream_xsalsa20_messagebytes_max(void);
SODIUM_EXPORT
int
crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k);
SODIUM_EXPORT
int
crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
SODIUM_EXPORT
void
crypto_stream_xsalsa20_keygen(
unsigned char k[crypto_stream_xsalsa20_KEYBYTES]);
#ifdef __cplusplus
}

@ -7,48 +7,48 @@
#include <limits.h>
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#endif
#define __attribute__(a)
#endif
#ifdef SODIUM_STATIC
# define SODIUM_EXPORT
# define SODIUM_EXPORT_WEAK
#define SODIUM_EXPORT
#define SODIUM_EXPORT_WEAK
#else
#if defined(_MSC_VER)
#ifdef SODIUM_DLL_EXPORT
#define SODIUM_EXPORT __declspec(dllexport)
#else
#define SODIUM_EXPORT __declspec(dllimport)
#endif
#else
#if defined(__SUNPRO_C)
#ifndef __GNU_C__
#define SODIUM_EXPORT __attribute__(visibility(__global))
#else
#define SODIUM_EXPORT __attribute__ __global
#endif
#elif defined(_MSG_VER)
#define SODIUM_EXPORT extern __declspec(dllexport)
#else
# if defined(_MSC_VER)
# ifdef SODIUM_DLL_EXPORT
# define SODIUM_EXPORT __declspec(dllexport)
# else
# define SODIUM_EXPORT __declspec(dllimport)
# endif
# else
# if defined(__SUNPRO_C)
# ifndef __GNU_C__
# define SODIUM_EXPORT __attribute__ (visibility(__global))
# else
# define SODIUM_EXPORT __attribute__ __global
# endif
# elif defined(_MSG_VER)
# define SODIUM_EXPORT extern __declspec(dllexport)
# else
# define SODIUM_EXPORT __attribute__ ((visibility ("default")))
# endif
# endif
# if defined(__ELF__) && !defined(SODIUM_DISABLE_WEAK_FUNCTIONS)
# define SODIUM_EXPORT_WEAK SODIUM_EXPORT __attribute__((weak))
# else
# define SODIUM_EXPORT_WEAK SODIUM_EXPORT
# endif
#define SODIUM_EXPORT __attribute__((visibility("default")))
#endif
#endif
#if defined(__ELF__) && !defined(SODIUM_DISABLE_WEAK_FUNCTIONS)
#define SODIUM_EXPORT_WEAK SODIUM_EXPORT __attribute__((weak))
#else
#define SODIUM_EXPORT_WEAK SODIUM_EXPORT
#endif
#endif
#ifndef CRYPTO_ALIGN
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#else
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#define SODIUM_MIN(A, B) ((A) < (B) ? (A) : (B))

@ -5,42 +5,42 @@
#include <stdlib.h>
#include <string.h>
#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])
#ifdef HAVE_TI_MODE
# if defined(__SIZEOF_INT128__)
#if defined(__SIZEOF_INT128__)
typedef unsigned __int128 uint128_t;
# else
#else
typedef unsigned uint128_t __attribute__((mode(TI)));
# endif
#endif
#endif
#define ROTL32(X, B) rotl32((X), (B))
static inline uint32_t
rotl32(const uint32_t x, const int b)
{
return (x << b) | (x >> (32 - b));
return (x << b) | (x >> (32 - b));
}
#define ROTL64(X, B) rotl64((X), (B))
static inline uint64_t
rotl64(const uint64_t x, const int b)
{
return (x << b) | (x >> (64 - b));
return (x << b) | (x >> (64 - b));
}
#define ROTR32(X, B) rotr32((X), (B))
static inline uint32_t
rotr32(const uint32_t x, const int b)
{
return (x >> b) | (x << (32 - b));
return (x >> b) | (x << (32 - b));
}
#define ROTR64(X, B) rotr64((X), (B))
static inline uint64_t
rotr64(const uint64_t x, const int b)
{
return (x >> b) | (x << (64 - b));
return (x >> b) | (x << (64 - b));
}
#define LOAD64_LE(SRC) load64_le(SRC)
@ -48,19 +48,19 @@ static inline uint64_t
load64_le(const uint8_t src[8])
{
#ifdef NATIVE_LITTLE_ENDIAN
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint64_t w = (uint64_t) src[0];
w |= (uint64_t) src[1] << 8;
w |= (uint64_t) src[2] << 16;
w |= (uint64_t) src[3] << 24;
w |= (uint64_t) src[4] << 32;
w |= (uint64_t) src[5] << 40;
w |= (uint64_t) src[6] << 48;
w |= (uint64_t) src[7] << 56;
return w;
uint64_t w = (uint64_t)src[0];
w |= (uint64_t)src[1] << 8;
w |= (uint64_t)src[2] << 16;
w |= (uint64_t)src[3] << 24;
w |= (uint64_t)src[4] << 32;
w |= (uint64_t)src[5] << 40;
w |= (uint64_t)src[6] << 48;
w |= (uint64_t)src[7] << 56;
return w;
#endif
}
@ -69,16 +69,23 @@ static inline void
store64_le(uint8_t dst[8], uint64_t w)
{
#ifdef NATIVE_LITTLE_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[7] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[7] = (uint8_t)w;
#endif
}
@ -87,15 +94,15 @@ static inline uint32_t
load32_le(const uint8_t src[4])
{
#ifdef NATIVE_LITTLE_ENDIAN
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint32_t w = (uint32_t) src[0];
w |= (uint32_t) src[1] << 8;
w |= (uint32_t) src[2] << 16;
w |= (uint32_t) src[3] << 24;
return w;
uint32_t w = (uint32_t)src[0];
w |= (uint32_t)src[1] << 8;
w |= (uint32_t)src[2] << 16;
w |= (uint32_t)src[3] << 24;
return w;
#endif
}
@ -104,12 +111,15 @@ static inline void
store32_le(uint8_t dst[4], uint32_t w)
{
#ifdef NATIVE_LITTLE_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[0] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w;
dst[0] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
#endif
}
@ -120,19 +130,19 @@ static inline uint64_t
load64_be(const uint8_t src[8])
{
#ifdef NATIVE_BIG_ENDIAN
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
uint64_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint64_t w = (uint64_t) src[7];
w |= (uint64_t) src[6] << 8;
w |= (uint64_t) src[5] << 16;
w |= (uint64_t) src[4] << 24;
w |= (uint64_t) src[3] << 32;
w |= (uint64_t) src[2] << 40;
w |= (uint64_t) src[1] << 48;
w |= (uint64_t) src[0] << 56;
return w;
uint64_t w = (uint64_t)src[7];
w |= (uint64_t)src[6] << 8;
w |= (uint64_t)src[5] << 16;
w |= (uint64_t)src[4] << 24;
w |= (uint64_t)src[3] << 32;
w |= (uint64_t)src[2] << 40;
w |= (uint64_t)src[1] << 48;
w |= (uint64_t)src[0] << 56;
return w;
#endif
}
@ -141,16 +151,23 @@ static inline void
store64_be(uint8_t dst[8], uint64_t w)
{
#ifdef NATIVE_BIG_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[7] = (uint8_t) w; w >>= 8;
dst[6] = (uint8_t) w; w >>= 8;
dst[5] = (uint8_t) w; w >>= 8;
dst[4] = (uint8_t) w; w >>= 8;
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[7] = (uint8_t)w;
w >>= 8;
dst[6] = (uint8_t)w;
w >>= 8;
dst[5] = (uint8_t)w;
w >>= 8;
dst[4] = (uint8_t)w;
w >>= 8;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
#endif
}
@ -159,15 +176,15 @@ static inline uint32_t
load32_be(const uint8_t src[4])
{
#ifdef NATIVE_BIG_ENDIAN
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
uint32_t w;
memcpy(&w, src, sizeof w);
return w;
#else
uint32_t w = (uint32_t) src[3];
w |= (uint32_t) src[2] << 8;
w |= (uint32_t) src[1] << 16;
w |= (uint32_t) src[0] << 24;
return w;
uint32_t w = (uint32_t)src[3];
w |= (uint32_t)src[2] << 8;
w |= (uint32_t)src[1] << 16;
w |= (uint32_t)src[0] << 24;
return w;
#endif
}
@ -176,12 +193,15 @@ static inline void
store32_be(uint8_t dst[4], uint32_t w)
{
#ifdef NATIVE_BIG_ENDIAN
memcpy(dst, &w, sizeof w);
memcpy(dst, &w, sizeof w);
#else
dst[3] = (uint8_t) w; w >>= 8;
dst[2] = (uint8_t) w; w >>= 8;
dst[1] = (uint8_t) w; w >>= 8;
dst[0] = (uint8_t) w;
dst[3] = (uint8_t)w;
w >>= 8;
dst[2] = (uint8_t)w;
w >>= 8;
dst[1] = (uint8_t)w;
w >>= 8;
dst[0] = (uint8_t)w;
#endif
}
@ -189,58 +209,61 @@ store32_be(uint8_t dst[4], uint32_t w)
static inline void
xor_buf(unsigned char *out, const unsigned char *in, size_t n)
{
size_t i;
size_t i;
for (i = 0; i < n; i++) {
out[i] ^= in[i];
}
for(i = 0; i < n; i++)
{
out[i] ^= in[i];
}
}
#if !defined(__clang__) && !defined(__GNUC__)
# ifdef __attribute__
# undef __attribute__
# endif
# define __attribute__(a)
#ifdef __attribute__
#undef __attribute__
#endif
#define __attribute__(a)
#endif
#ifndef CRYPTO_ALIGN
# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
# define CRYPTO_ALIGN(x) __declspec(align(x))
# else
# define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
# endif
#endif
#if defined(_MSC_VER) && \
(defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
# include <intrin.h>
# define HAVE_INTRIN_H 1
# define HAVE_MMINTRIN_H 1
# define HAVE_EMMINTRIN_H 1
# define HAVE_PMMINTRIN_H 1
# define HAVE_TMMINTRIN_H 1
# define HAVE_SMMINTRIN_H 1
# define HAVE_AVXINTRIN_H 1
# if _MSC_VER >= 1600
# define HAVE_WMMINTRIN_H 1
# endif
# if _MSC_VER >= 1700 && defined(_M_X64)
# define HAVE_AVX2INTRIN_H 1
# endif
#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
#define CRYPTO_ALIGN(x) __declspec(align(x))
#else
#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#if defined(_MSC_VER) \
&& (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
#include <intrin.h>
#define HAVE_INTRIN_H 1
#define HAVE_MMINTRIN_H 1
#define HAVE_EMMINTRIN_H 1
#define HAVE_PMMINTRIN_H 1
#define HAVE_TMMINTRIN_H 1
#define HAVE_SMMINTRIN_H 1
#define HAVE_AVXINTRIN_H 1
#if _MSC_VER >= 1600
#define HAVE_WMMINTRIN_H 1
#endif
#if _MSC_VER >= 1700 && defined(_M_X64)
#define HAVE_AVX2INTRIN_H 1
#endif
#elif defined(HAVE_INTRIN_H)
# include <intrin.h>
#include <intrin.h>
#endif
#ifdef HAVE_LIBCTGRIND
extern void ct_poison (const void *, size_t);
extern void ct_unpoison(const void *, size_t);
# define POISON(X, L) ct_poison((X), (L))
# define UNPOISON(X, L) ct_unpoison((X), (L))
extern void
ct_poison(const void *, size_t);
extern void
ct_unpoison(const void *, size_t);
#define POISON(X, L) ct_poison((X), (L))
#define UNPOISON(X, L) ct_unpoison((X), (L))
#else
# define POISON(X, L) (void) 0
# define UNPOISON(X, L) (void) 0
#define POISON(X, L) (void)0
#define UNPOISON(X, L) (void)0
#endif
#endif

@ -15,17 +15,19 @@ typedef uint64_t fe25519[5];
typedef int32_t fe25519[10];
#endif
void fe25519_invert(fe25519 out, const fe25519 z);
void fe25519_frombytes(fe25519 h, const unsigned char *s);
void fe25519_tobytes(unsigned char *s, const fe25519 h);
void
fe25519_invert(fe25519 out, const fe25519 z);
void
fe25519_frombytes(fe25519 h, const unsigned char *s);
void
fe25519_tobytes(unsigned char *s, const fe25519 h);
#ifdef HAVE_TI_MODE
# include "ed25519_ref10_fe_51.h"
#include "ed25519_ref10_fe_51.h"
#else
# include "ed25519_ref10_fe_25_5.h"
#include "ed25519_ref10_fe_25_5.h"
#endif
/*
ge means group element.
@ -40,86 +42,109 @@ void fe25519_tobytes(unsigned char *s, const fe25519 h);
ge25519_precomp (Duif): (y+x,y-x,2dxy)
*/
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
typedef struct
{
fe25519 X;
fe25519 Y;
fe25519 Z;
} ge25519_p2;
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
typedef struct
{
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
} ge25519_p3;
typedef struct {
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
typedef struct
{
fe25519 X;
fe25519 Y;
fe25519 Z;
fe25519 T;
} ge25519_p1p1;
typedef struct {
fe25519 yplusx;
fe25519 yminusx;
fe25519 xy2d;
typedef struct
{
fe25519 yplusx;
fe25519 yminusx;
fe25519 xy2d;
} ge25519_precomp;
typedef struct {
fe25519 YplusX;
fe25519 YminusX;
fe25519 Z;
fe25519 T2d;
typedef struct
{
fe25519 YplusX;
fe25519 YminusX;
fe25519 Z;
fe25519 T2d;
} ge25519_cached;
void ge25519_tobytes(unsigned char *s, const ge25519_p2 *h);
void
ge25519_tobytes(unsigned char *s, const ge25519_p2 *h);
void ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h);
void
ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h);
int ge25519_frombytes(ge25519_p3 *h, const unsigned char *s);
int
ge25519_frombytes(ge25519_p3 *h, const unsigned char *s);
int ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s);
int
ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s);
void ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p);
void
ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p);
void ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p);
void
ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p);
void ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p);
void
ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p);
void ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void
ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void
ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
void ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a);
void
ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a);
void ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
const ge25519_p3 *A,
const unsigned char *b);
void
ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
const ge25519_p3 *A, const unsigned char *b);
void ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a,
const ge25519_p3 *p);
void
ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a, const ge25519_p3 *p);
int ge25519_is_canonical(const unsigned char *s);
int
ge25519_is_canonical(const unsigned char *s);
int ge25519_is_on_curve(const ge25519_p3 *p);
int
ge25519_is_on_curve(const ge25519_p3 *p);
int ge25519_is_on_main_subgroup(const ge25519_p3 *p);
int
ge25519_is_on_main_subgroup(const ge25519_p3 *p);
int ge25519_has_small_order(const unsigned char s[32]);
int
ge25519_has_small_order(const unsigned char s[32]);
void ge25519_from_uniform(unsigned char s[32], const unsigned char r[32]);
void
ge25519_from_uniform(unsigned char s[32], const unsigned char r[32]);
/*
The set of scalars is \Z/l
where l = 2^252 + 27742317777372353535851937790883648493.
*/
void sc25519_reduce(unsigned char *s);
void
sc25519_reduce(unsigned char *s);
void sc25519_muladd(unsigned char *s, const unsigned char *a,
const unsigned char *b, const unsigned char *c);
void
sc25519_muladd(unsigned char *s, const unsigned char *a, const unsigned char *b,
const unsigned char *c);
int sc25519_is_canonical(const unsigned char *s);
int
sc25519_is_canonical(const unsigned char *s);
#endif

@ -10,7 +10,7 @@
static inline void
fe25519_0(fe25519 h)
{
memset(&h[0], 0, 5 * sizeof h[0]);
memset(&h[0], 0, 5 * sizeof h[0]);
}
/*
@ -20,8 +20,8 @@ fe25519_0(fe25519 h)
static inline void
fe25519_1(fe25519 h)
{
h[0] = 1;
memset(&h[1], 0, 4 * sizeof h[0]);
h[0] = 1;
memset(&h[1], 0, 4 * sizeof h[0]);
}
/*
@ -32,17 +32,17 @@ fe25519_1(fe25519 h)
static inline void
fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
{
uint64_t h0 = f[0] + g[0];
uint64_t h1 = f[1] + g[1];
uint64_t h2 = f[2] + g[2];
uint64_t h3 = f[3] + g[3];
uint64_t h4 = f[4] + g[4];
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
uint64_t h0 = f[0] + g[0];
uint64_t h1 = f[1] + g[1];
uint64_t h2 = f[2] + g[2];
uint64_t h3 = f[3] + g[3];
uint64_t h4 = f[4] + g[4];
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
}
/*
@ -52,37 +52,37 @@ fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
static void
fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint64_t h0, h1, h2, h3, h4;
h0 = g[0];
h1 = g[1];
h2 = g[2];
h3 = g[3];
h4 = g[4];
h1 += h0 >> 51;
h0 &= mask;
h2 += h1 >> 51;
h1 &= mask;
h3 += h2 >> 51;
h2 &= mask;
h4 += h3 >> 51;
h3 &= mask;
h0 += 19ULL * (h4 >> 51);
h4 &= mask;
h0 = (f[0] + 0xfffffffffffdaULL) - h0;
h1 = (f[1] + 0xffffffffffffeULL) - h1;
h2 = (f[2] + 0xffffffffffffeULL) - h2;
h3 = (f[3] + 0xffffffffffffeULL) - h3;
h4 = (f[4] + 0xffffffffffffeULL) - h4;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
const uint64_t mask = 0x7ffffffffffffULL;
uint64_t h0, h1, h2, h3, h4;
h0 = g[0];
h1 = g[1];
h2 = g[2];
h3 = g[3];
h4 = g[4];
h1 += h0 >> 51;
h0 &= mask;
h2 += h1 >> 51;
h1 &= mask;
h3 += h2 >> 51;
h2 &= mask;
h4 += h3 >> 51;
h3 &= mask;
h0 += 19ULL * (h4 >> 51);
h4 &= mask;
h0 = (f[0] + 0xfffffffffffdaULL) - h0;
h1 = (f[1] + 0xffffffffffffeULL) - h1;
h2 = (f[2] + 0xffffffffffffeULL) - h2;
h3 = (f[3] + 0xffffffffffffeULL) - h3;
h4 = (f[4] + 0xffffffffffffeULL) - h4;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
}
/*
@ -92,10 +92,10 @@ fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
static inline void
fe25519_neg(fe25519 h, const fe25519 f)
{
fe25519 zero;
fe25519 zero;
fe25519_0(zero);
fe25519_sub(h, zero, f);
fe25519_0(zero);
fe25519_sub(h, zero, f);
}
/*
@ -108,31 +108,31 @@ fe25519_neg(fe25519 h, const fe25519 f)
static void
fe25519_cmov(fe25519 f, const fe25519 g, unsigned int b)
{
const uint64_t mask = (uint64_t) (-(int64_t) b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t x0 = f0 ^ g[0];
uint64_t x1 = f1 ^ g[1];
uint64_t x2 = f2 ^ g[2];
uint64_t x3 = f3 ^ g[3];
uint64_t x4 = f4 ^ g[4];
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
const uint64_t mask = (uint64_t)(-(int64_t)b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t x0 = f0 ^ g[0];
uint64_t x1 = f1 ^ g[1];
uint64_t x2 = f2 ^ g[2];
uint64_t x3 = f3 ^ g[3];
uint64_t x4 = f4 ^ g[4];
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
}
/*
@ -145,43 +145,43 @@ Preconditions: b in {0,1}.
static void
fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
{
const uint64_t mask = (uint64_t) (-(int64_t) b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t g0 = g[0];
uint64_t g1 = g[1];
uint64_t g2 = g[2];
uint64_t g3 = g[3];
uint64_t g4 = g[4];
uint64_t x0 = f0 ^ g0;
uint64_t x1 = f1 ^ g1;
uint64_t x2 = f2 ^ g2;
uint64_t x3 = f3 ^ g3;
uint64_t x4 = f4 ^ g4;
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
g[0] = g0 ^ x0;
g[1] = g1 ^ x1;
g[2] = g2 ^ x2;
g[3] = g3 ^ x3;
g[4] = g4 ^ x4;
const uint64_t mask = (uint64_t)(-(int64_t)b);
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
uint64_t g0 = g[0];
uint64_t g1 = g[1];
uint64_t g2 = g[2];
uint64_t g3 = g[3];
uint64_t g4 = g[4];
uint64_t x0 = f0 ^ g0;
uint64_t x1 = f1 ^ g1;
uint64_t x2 = f2 ^ g2;
uint64_t x3 = f3 ^ g3;
uint64_t x4 = f4 ^ g4;
x0 &= mask;
x1 &= mask;
x2 &= mask;
x3 &= mask;
x4 &= mask;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
g[0] = g0 ^ x0;
g[1] = g1 ^ x1;
g[2] = g2 ^ x2;
g[3] = g3 ^ x3;
g[4] = g4 ^ x4;
}
/*
@ -191,17 +191,17 @@ fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
static inline void
fe25519_copy(fe25519 h, const fe25519 f)
{
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
h[0] = f0;
h[1] = f1;
h[2] = f2;
h[3] = f3;
h[4] = f4;
uint64_t f0 = f[0];
uint64_t f1 = f[1];
uint64_t f2 = f[2];
uint64_t f3 = f[3];
uint64_t f4 = f[4];
h[0] = f0;
h[1] = f1;
h[2] = f2;
h[3] = f3;
h[4] = f4;
}
/*
@ -212,11 +212,11 @@ fe25519_copy(fe25519 h, const fe25519 f)
static inline int
fe25519_isnegative(const fe25519 f)
{
unsigned char s[32];
unsigned char s[32];
fe25519_tobytes(s, f);
fe25519_tobytes(s, f);
return s[0] & 1;
return s[0] & 1;
}
/*
@ -227,11 +227,11 @@ fe25519_isnegative(const fe25519 f)
static inline int
fe25519_iszero(const fe25519 f)
{
unsigned char s[32];
unsigned char s[32];
fe25519_tobytes(s, f);
fe25519_tobytes(s, f);
return sodium_is_zero(s, 32);
return sodium_is_zero(s, 32);
}
/*
@ -242,87 +242,87 @@ fe25519_iszero(const fe25519 f)
static void
fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f1_19, f2_19, f3_19, f4_19;
uint64_t g0, g1, g2, g3, g4;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
g0 = g[0];
g1 = g[1];
g2 = g[2];
g3 = g[3];
g4 = g[4];
f1_19 = 19ULL * f1;
f2_19 = 19ULL * f2;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) g0);
r0 += ((uint128_t) f1_19) * ((uint128_t) g4);
r0 += ((uint128_t) f2_19) * ((uint128_t) g3);
r0 += ((uint128_t) f3_19) * ((uint128_t) g2);
r0 += ((uint128_t) f4_19) * ((uint128_t) g1);
r1 = ((uint128_t) f0 ) * ((uint128_t) g1);
r1 += ((uint128_t) f1 ) * ((uint128_t) g0);
r1 += ((uint128_t) f2_19) * ((uint128_t) g4);
r1 += ((uint128_t) f3_19) * ((uint128_t) g3);
r1 += ((uint128_t) f4_19) * ((uint128_t) g2);
r2 = ((uint128_t) f0 ) * ((uint128_t) g2);
r2 += ((uint128_t) f1 ) * ((uint128_t) g1);
r2 += ((uint128_t) f2 ) * ((uint128_t) g0);
r2 += ((uint128_t) f3_19) * ((uint128_t) g4);
r2 += ((uint128_t) f4_19) * ((uint128_t) g3);
r3 = ((uint128_t) f0 ) * ((uint128_t) g3);
r3 += ((uint128_t) f1 ) * ((uint128_t) g2);
r3 += ((uint128_t) f2 ) * ((uint128_t) g1);
r3 += ((uint128_t) f3 ) * ((uint128_t) g0);
r3 += ((uint128_t) f4_19) * ((uint128_t) g4);
r4 = ((uint128_t) f0 ) * ((uint128_t) g4);
r4 += ((uint128_t) f1 ) * ((uint128_t) g3);
r4 += ((uint128_t) f2 ) * ((uint128_t) g2);
r4 += ((uint128_t) f3 ) * ((uint128_t) g1);
r4 += ((uint128_t) f4 ) * ((uint128_t) g0);
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f1_19, f2_19, f3_19, f4_19;
uint64_t g0, g1, g2, g3, g4;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
g0 = g[0];
g1 = g[1];
g2 = g[2];
g3 = g[3];
g4 = g[4];
f1_19 = 19ULL * f1;
f2_19 = 19ULL * f2;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
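/* Schoolbook multiplication over five 51-bit limbs: since 2^255 = 19 (mod p),
 * the pre-scaled f*_19 factors fold the partial products that overflow bit
 * 255 back into the low limbs before the carry chain below. */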
r0 = ((uint128_t)f0) * ((uint128_t)g0);
r0 += ((uint128_t)f1_19) * ((uint128_t)g4);
r0 += ((uint128_t)f2_19) * ((uint128_t)g3);
r0 += ((uint128_t)f3_19) * ((uint128_t)g2);
r0 += ((uint128_t)f4_19) * ((uint128_t)g1);
r1 = ((uint128_t)f0) * ((uint128_t)g1);
r1 += ((uint128_t)f1) * ((uint128_t)g0);
r1 += ((uint128_t)f2_19) * ((uint128_t)g4);
r1 += ((uint128_t)f3_19) * ((uint128_t)g3);
r1 += ((uint128_t)f4_19) * ((uint128_t)g2);
r2 = ((uint128_t)f0) * ((uint128_t)g2);
r2 += ((uint128_t)f1) * ((uint128_t)g1);
r2 += ((uint128_t)f2) * ((uint128_t)g0);
r2 += ((uint128_t)f3_19) * ((uint128_t)g4);
r2 += ((uint128_t)f4_19) * ((uint128_t)g3);
r3 = ((uint128_t)f0) * ((uint128_t)g3);
r3 += ((uint128_t)f1) * ((uint128_t)g2);
r3 += ((uint128_t)f2) * ((uint128_t)g1);
r3 += ((uint128_t)f3) * ((uint128_t)g0);
r3 += ((uint128_t)f4_19) * ((uint128_t)g4);
r4 = ((uint128_t)f0) * ((uint128_t)g4);
r4 += ((uint128_t)f1) * ((uint128_t)g3);
r4 += ((uint128_t)f2) * ((uint128_t)g2);
r4 += ((uint128_t)f3) * ((uint128_t)g1);
r4 += ((uint128_t)f4) * ((uint128_t)g0);
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
}
/*
@ -333,75 +333,75 @@ fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
static void
fe25519_sq(fe25519 h, const fe25519 f)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) f0);
r0 += ((uint128_t) f1_38) * ((uint128_t) f4);
r0 += ((uint128_t) f2_38) * ((uint128_t) f3);
r1 = ((uint128_t) f0_2 ) * ((uint128_t) f1);
r1 += ((uint128_t) f2_38) * ((uint128_t) f4);
r1 += ((uint128_t) f3_19) * ((uint128_t) f3);
r2 = ((uint128_t) f0_2 ) * ((uint128_t) f2);
r2 += ((uint128_t) f1 ) * ((uint128_t) f1);
r2 += ((uint128_t) f3_38) * ((uint128_t) f4);
r3 = ((uint128_t) f0_2 ) * ((uint128_t) f3);
r3 += ((uint128_t) f1_2 ) * ((uint128_t) f2);
r3 += ((uint128_t) f4_19) * ((uint128_t) f4);
r4 = ((uint128_t) f0_2 ) * ((uint128_t) f4);
r4 += ((uint128_t) f1_2 ) * ((uint128_t) f3);
r4 += ((uint128_t) f2 ) * ((uint128_t) f2);
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t)f0) * ((uint128_t)f0);
r0 += ((uint128_t)f1_38) * ((uint128_t)f4);
r0 += ((uint128_t)f2_38) * ((uint128_t)f3);
r1 = ((uint128_t)f0_2) * ((uint128_t)f1);
r1 += ((uint128_t)f2_38) * ((uint128_t)f4);
r1 += ((uint128_t)f3_19) * ((uint128_t)f3);
r2 = ((uint128_t)f0_2) * ((uint128_t)f2);
r2 += ((uint128_t)f1) * ((uint128_t)f1);
r2 += ((uint128_t)f3_38) * ((uint128_t)f4);
r3 = ((uint128_t)f0_2) * ((uint128_t)f3);
r3 += ((uint128_t)f1_2) * ((uint128_t)f2);
r3 += ((uint128_t)f4_19) * ((uint128_t)f4);
r4 = ((uint128_t)f0_2) * ((uint128_t)f4);
r4 += ((uint128_t)f1_2) * ((uint128_t)f3);
r4 += ((uint128_t)f2) * ((uint128_t)f2);
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
}
/*
@ -412,107 +412,107 @@ fe25519_sq(fe25519 h, const fe25519 f)
static void
fe25519_sq2(fe25519 h, const fe25519 f)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t) f0 ) * ((uint128_t) f0);
r0 += ((uint128_t) f1_38) * ((uint128_t) f4);
r0 += ((uint128_t) f2_38) * ((uint128_t) f3);
r1 = ((uint128_t) f0_2 ) * ((uint128_t) f1);
r1 += ((uint128_t) f2_38) * ((uint128_t) f4);
r1 += ((uint128_t) f3_19) * ((uint128_t) f3);
r2 = ((uint128_t) f0_2 ) * ((uint128_t) f2);
r2 += ((uint128_t) f1 ) * ((uint128_t) f1);
r2 += ((uint128_t) f3_38) * ((uint128_t) f4);
r3 = ((uint128_t) f0_2 ) * ((uint128_t) f3);
r3 += ((uint128_t) f1_2 ) * ((uint128_t) f2);
r3 += ((uint128_t) f4_19) * ((uint128_t) f4);
r4 = ((uint128_t) f0_2 ) * ((uint128_t) f4);
r4 += ((uint128_t) f1_2 ) * ((uint128_t) f3);
r4 += ((uint128_t) f2 ) * ((uint128_t) f2);
r0 <<= 1;
r1 <<= 1;
r2 <<= 1;
r3 <<= 1;
r4 <<= 1;
r00 = ((uint64_t) r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t) r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t) r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t) r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t) r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t) carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t) carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t) carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t r0, r1, r2, r3, r4, carry;
uint64_t f0, f1, f2, f3, f4;
uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
uint64_t r00, r01, r02, r03, r04;
f0 = f[0];
f1 = f[1];
f2 = f[2];
f3 = f[3];
f4 = f[4];
f0_2 = f0 << 1;
f1_2 = f1 << 1;
f1_38 = 38ULL * f1;
f2_38 = 38ULL * f2;
f3_38 = 38ULL * f3;
f3_19 = 19ULL * f3;
f4_19 = 19ULL * f4;
r0 = ((uint128_t)f0) * ((uint128_t)f0);
r0 += ((uint128_t)f1_38) * ((uint128_t)f4);
r0 += ((uint128_t)f2_38) * ((uint128_t)f3);
r1 = ((uint128_t)f0_2) * ((uint128_t)f1);
r1 += ((uint128_t)f2_38) * ((uint128_t)f4);
r1 += ((uint128_t)f3_19) * ((uint128_t)f3);
r2 = ((uint128_t)f0_2) * ((uint128_t)f2);
r2 += ((uint128_t)f1) * ((uint128_t)f1);
r2 += ((uint128_t)f3_38) * ((uint128_t)f4);
r3 = ((uint128_t)f0_2) * ((uint128_t)f3);
r3 += ((uint128_t)f1_2) * ((uint128_t)f2);
r3 += ((uint128_t)f4_19) * ((uint128_t)f4);
r4 = ((uint128_t)f0_2) * ((uint128_t)f4);
r4 += ((uint128_t)f1_2) * ((uint128_t)f3);
r4 += ((uint128_t)f2) * ((uint128_t)f2);
r0 <<= 1;
r1 <<= 1;
r2 <<= 1;
r3 <<= 1;
r4 <<= 1;
r00 = ((uint64_t)r0) & mask;
carry = r0 >> 51;
r1 += carry;
r01 = ((uint64_t)r1) & mask;
carry = r1 >> 51;
r2 += carry;
r02 = ((uint64_t)r2) & mask;
carry = r2 >> 51;
r3 += carry;
r03 = ((uint64_t)r3) & mask;
carry = r3 >> 51;
r4 += carry;
r04 = ((uint64_t)r4) & mask;
carry = r4 >> 51;
r00 += 19ULL * (uint64_t)carry;
carry = r00 >> 51;
r00 &= mask;
r01 += (uint64_t)carry;
carry = r01 >> 51;
r01 &= mask;
r02 += (uint64_t)carry;
h[0] = r00;
h[1] = r01;
h[2] = r02;
h[3] = r03;
h[4] = r04;
}
static void
fe25519_scalar_product(fe25519 h, const fe25519 f, uint32_t n)
{
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t a;
uint128_t sn = (uint128_t) n;
uint64_t h0, h1, h2, h3, h4;
a = f[0] * sn;
h0 = ((uint64_t) a) & mask;
a = f[1] * sn + ((uint64_t) (a >> 51));
h1 = ((uint64_t) a) & mask;
a = f[2] * sn + ((uint64_t) (a >> 51));
h2 = ((uint64_t) a) & mask;
a = f[3] * sn + ((uint64_t) (a >> 51));
h3 = ((uint64_t) a) & mask;
a = f[4] * sn + ((uint64_t) (a >> 51));
h4 = ((uint64_t) a) & mask;
h0 += (a >> 51) * 19ULL;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
const uint64_t mask = 0x7ffffffffffffULL;
uint128_t a;
uint128_t sn = (uint128_t)n;
uint64_t h0, h1, h2, h3, h4;
a = f[0] * sn;
h0 = ((uint64_t)a) & mask;
a = f[1] * sn + ((uint64_t)(a >> 51));
h1 = ((uint64_t)a) & mask;
a = f[2] * sn + ((uint64_t)(a >> 51));
h2 = ((uint64_t)a) & mask;
a = f[3] * sn + ((uint64_t)(a >> 51));
h3 = ((uint64_t)a) & mask;
a = f[4] * sn + ((uint64_t)(a >> 51));
h4 = ((uint64_t)a) & mask;
h0 += (a >> 51) * 19ULL;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
}

@ -1,11 +1,17 @@
#ifndef implementations_H
#define implementations_H
int _crypto_generichash_blake2b_pick_best_implementation(void);
int _crypto_onetimeauth_poly1305_pick_best_implementation(void);
int _crypto_pwhash_argon2_pick_best_implementation(void);
int _crypto_scalarmult_curve25519_pick_best_implementation(void);
int _crypto_stream_chacha20_pick_best_implementation(void);
int _crypto_stream_salsa20_pick_best_implementation(void);
int
_crypto_generichash_blake2b_pick_best_implementation(void);
int
_crypto_onetimeauth_poly1305_pick_best_implementation(void);
int
_crypto_pwhash_argon2_pick_best_implementation(void);
int
_crypto_scalarmult_curve25519_pick_best_implementation(void);
int
_crypto_stream_chacha20_pick_best_implementation(void);
int
_crypto_stream_salsa20_pick_best_implementation(void);
#endif

@ -1,7 +1,9 @@
#ifndef mutex_H
#define mutex_H 1
extern int sodium_crit_enter(void);
extern int sodium_crit_leave(void);
extern int
sodium_crit_enter(void);
extern int
sodium_crit_leave(void);
#endif

@ -4,46 +4,53 @@
#include "common.h"
#ifdef HAVE_INTRIN_H
# include <intrin.h>
#include <intrin.h>
#endif
#if defined(HAVE_EMMINTRIN_H) && \
!(defined(__amd64) || defined(__amd64__) || defined(__x86_64__) || \
defined(_M_X64) || defined(_M_AMD64))
#if defined(HAVE_EMMINTRIN_H) \
&& !(defined(__amd64) || defined(__amd64__) || defined(__x86_64__) \
|| defined(_M_X64) || defined(_M_AMD64))
# include <emmintrin.h>
# include <stdint.h>
#include <emmintrin.h>
#include <stdint.h>
# ifndef _mm_set_epi64x
# define _mm_set_epi64x(Q0, Q1) sodium__mm_set_epi64x((Q0), (Q1))
#ifndef _mm_set_epi64x
#define _mm_set_epi64x(Q0, Q1) sodium__mm_set_epi64x((Q0), (Q1))
static inline __m128i
sodium__mm_set_epi64x(int64_t q1, int64_t q0)
{
union { int64_t as64; int32_t as32[2]; } x0, x1;
x0.as64 = q0; x1.as64 = q1;
return _mm_set_epi32(x1.as32[1], x1.as32[0], x0.as32[1], x0.as32[0]);
union {
int64_t as64;
int32_t as32[2];
} x0, x1;
x0.as64 = q0;
x1.as64 = q1;
return _mm_set_epi32(x1.as32[1], x1.as32[0], x0.as32[1], x0.as32[0]);
}
# endif
#endif
# ifndef _mm_set1_epi64x
# define _mm_set1_epi64x(Q) sodium__mm_set1_epi64x(Q)
#ifndef _mm_set1_epi64x
#define _mm_set1_epi64x(Q) sodium__mm_set1_epi64x(Q)
static inline __m128i
sodium__mm_set1_epi64x(int64_t q)
{
return _mm_set_epi64x(q, q);
return _mm_set_epi64x(q, q);
}
# endif
#endif
# ifndef _mm_cvtsi64_si128
# define _mm_cvtsi64_si128(Q) sodium__mm_cvtsi64_si128(Q)
#ifndef _mm_cvtsi64_si128
#define _mm_cvtsi64_si128(Q) sodium__mm_cvtsi64_si128(Q)
static inline __m128i
sodium__mm_cvtsi64_si128(int64_t q)
{
union { int64_t as64; int32_t as32[2]; } x;
x.as64 = q;
return _mm_setr_epi32(x.as32[0], x.as32[1], 0, 0);
union {
int64_t as64;
int32_t as32[2];
} x;
x.as64 = q;
return _mm_setr_epi32(x.as32[0], x.as32[1], 0, 0);
}
# endif
#endif
#endif

@ -4,19 +4,21 @@
#ifdef __native_client__
# include "export.h"
# include "randombytes.h"
#include "export.h"
#include "randombytes.h"
# ifdef __cplusplus
extern "C" {
# endif
#ifdef __cplusplus
extern "C"
{
#endif
SODIUM_EXPORT
extern struct randombytes_implementation randombytes_nativeclient_implementation;
SODIUM_EXPORT
extern struct randombytes_implementation
randombytes_nativeclient_implementation;
# ifdef __cplusplus
#ifdef __cplusplus
}
# endif
#endif
#endif

@ -5,45 +5,59 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
SODIUM_EXPORT_WEAK
int sodium_runtime_has_neon(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_neon(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_sse2(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_sse2(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_sse3(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_sse3(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_ssse3(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_ssse3(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_sse41(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_sse41(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_avx(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_avx(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_avx2(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_avx2(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_avx512f(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_avx512f(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_pclmul(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_pclmul(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_aesni(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_aesni(void);
SODIUM_EXPORT_WEAK
int sodium_runtime_has_rdrand(void);
SODIUM_EXPORT_WEAK
int
sodium_runtime_has_rdrand(void);
/* ------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------
*/
int _sodium_runtime_get_cpu_features(void);
int
_sodium_runtime_get_cpu_features(void);
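/* Illustrative use of the probes above (not part of this diff): each
 * sodium_runtime_has_*() call returns nonzero once the corresponding CPU
 * feature has been detected, so callers can pick a vectorised code path. */
static void
pick_impl_sketch(void)
{
    if (sodium_runtime_has_avx2()) {
        /* prefer an AVX2 implementation */
    } else if (sodium_runtime_has_sse2()) {
        /* otherwise fall back to SSE2 */
    }
}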
#ifdef __cplusplus
}

@ -7,161 +7,188 @@
#include "export.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
#ifndef SODIUM_C99
# if defined(__cplusplus) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
# define SODIUM_C99(X)
# else
# define SODIUM_C99(X) X
# endif
#if defined(__cplusplus) || !defined(__STDC_VERSION__) \
|| __STDC_VERSION__ < 199901L
#define SODIUM_C99(X)
#else
#define SODIUM_C99(X) X
#endif
#endif
SODIUM_EXPORT
void sodium_memzero(void * const pnt, const size_t len);
SODIUM_EXPORT
void sodium_stackzero(const size_t len);
/*
* WARNING: sodium_memcmp() must be used to verify if two secret keys
* are equal, in constant time.
* It returns 0 if the keys are equal, and -1 if they differ.
* This function is not designed for lexicographical comparisons.
*/
SODIUM_EXPORT
int sodium_memcmp(const void * const b1_, const void * const b2_, size_t len)
__attribute__ ((warn_unused_result));
/*
* sodium_compare() returns -1 if b1_ < b2_, 1 if b1_ > b2_ and 0 if b1_ == b2_.
* It is suitable for lexicographical comparisons, or to compare nonces
* and counters stored in little-endian format.
* However, it is slower than sodium_memcmp().
*/
SODIUM_EXPORT
int sodium_compare(const unsigned char *b1_, const unsigned char *b2_,
size_t len)
__attribute__ ((warn_unused_result));
SODIUM_EXPORT
int sodium_is_zero(const unsigned char *n, const size_t nlen);
SODIUM_EXPORT
void sodium_increment(unsigned char *n, const size_t nlen);
SODIUM_EXPORT
void sodium_add(unsigned char *a, const unsigned char *b, const size_t len);
SODIUM_EXPORT
char *sodium_bin2hex(char * const hex, const size_t hex_maxlen,
const unsigned char * const bin, const size_t bin_len);
SODIUM_EXPORT
int sodium_hex2bin(unsigned char * const bin, const size_t bin_maxlen,
const char * const hex, const size_t hex_len,
const char * const ignore, size_t * const bin_len,
const char ** const hex_end);
#define sodium_base64_VARIANT_ORIGINAL 1
SODIUM_EXPORT
void
sodium_memzero(void *const pnt, const size_t len);
SODIUM_EXPORT
void
sodium_stackzero(const size_t len);
/*
* WARNING: sodium_memcmp() must be used to verify if two secret keys
* are equal, in constant time.
* It returns 0 if the keys are equal, and -1 if they differ.
* This function is not designed for lexicographical comparisons.
*/
SODIUM_EXPORT
int
sodium_memcmp(const void *const b1_, const void *const b2_, size_t len)
__attribute__((warn_unused_result));
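/* Caller-side sketch of the constant-time check described above (not part of
 * this diff): two 32-byte secret keys are compared without branching on
 * secret data. The helper name is only illustrative. */
static int
keys_match_sketch(const unsigned char k1[32], const unsigned char k2[32])
{
    return sodium_memcmp(k1, k2, 32) == 0;
}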
/*
* sodium_compare() returns -1 if b1_ < b2_, 1 if b1_ > b2_ and 0 if b1_ == b2_.
* It is suitable for lexicographical comparisons, or to compare nonces
* and counters stored in little-endian format.
* However, it is slower than sodium_memcmp().
*/
SODIUM_EXPORT
int
sodium_compare(const unsigned char *b1_, const unsigned char *b2_, size_t len)
__attribute__((warn_unused_result));
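/* Sketch (not part of this diff): because sodium_compare() treats both
 * buffers as little-endian numbers, it can order nonces or counters; here an
 * incoming nonce is accepted only if it is strictly newer than the last one
 * seen. */
static int
nonce_is_newer_sketch(const unsigned char seen[24], const unsigned char incoming[24])
{
    return sodium_compare(seen, incoming, 24) < 0; /* -1 means seen < incoming */
}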
SODIUM_EXPORT
int
sodium_is_zero(const unsigned char *n, const size_t nlen);
SODIUM_EXPORT
void
sodium_increment(unsigned char *n, const size_t nlen);
SODIUM_EXPORT
void
sodium_add(unsigned char *a, const unsigned char *b, const size_t len);
SODIUM_EXPORT
char *
sodium_bin2hex(char *const hex, const size_t hex_maxlen,
const unsigned char *const bin, const size_t bin_len);
SODIUM_EXPORT
int
sodium_hex2bin(unsigned char *const bin, const size_t bin_maxlen,
const char *const hex, const size_t hex_len,
const char *const ignore, size_t *const bin_len,
const char **const hex_end);
#define sodium_base64_VARIANT_ORIGINAL 1
#define sodium_base64_VARIANT_ORIGINAL_NO_PADDING 3
#define sodium_base64_VARIANT_URLSAFE 5
#define sodium_base64_VARIANT_URLSAFE_NO_PADDING 7
#define sodium_base64_VARIANT_URLSAFE 5
#define sodium_base64_VARIANT_URLSAFE_NO_PADDING 7
/*
* Computes the required length to encode BIN_LEN bytes as a base64 string
* using the given variant. The computed length includes a trailing \0.
*/
#define sodium_base64_ENCODED_LEN(BIN_LEN, VARIANT) \
(((BIN_LEN) / 3U) * 4U + \
((((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) | (((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) >> 1)) & 1U) * \
(4U - (~((((VARIANT) & 2U) >> 1) - 1U) & (3U - ((BIN_LEN) - ((BIN_LEN) / 3U) * 3U)))) + 1U)
SODIUM_EXPORT
size_t sodium_base64_encoded_len(const size_t bin_len, const int variant);
SODIUM_EXPORT
char *sodium_bin2base64(char * const b64, const size_t b64_maxlen,
const unsigned char * const bin, const size_t bin_len,
const int variant);
SODIUM_EXPORT
int sodium_base642bin(unsigned char * const bin, const size_t bin_maxlen,
const char * const b64, const size_t b64_len,
const char * const ignore, size_t * const bin_len,
const char ** const b64_end, const int variant);
SODIUM_EXPORT
int sodium_mlock(void * const addr, const size_t len);
SODIUM_EXPORT
int sodium_munlock(void * const addr, const size_t len);
/* WARNING: sodium_malloc() and sodium_allocarray() are not general-purpose
* allocation functions.
*
* They return a pointer to a region filled with 0xd0 bytes, immediately
* followed by a guard page.
* As a result, accessing a single byte after the requested allocation size
* will intentionally trigger a segmentation fault.
*
* A canary and an additional guard page placed before the beginning of the
* region may also kill the process if a buffer underflow is detected.
*
* The memory layout is:
* [unprotected region size (read only)][guard page (no access)][unprotected pages (read/write)][guard page (no access)]
* With the layout of the unprotected pages being:
* [optional padding][16-byte canary][user region]
*
* However:
* - These functions are significantly slower than standard functions
* - Each allocation requires 3 or 4 additional pages
* - The returned address will not be aligned if the allocation size is not
* a multiple of the required alignment. For this reason, these functions
* are designed to store data, such as secret keys and messages.
*
* sodium_malloc() can be used to allocate any libsodium data structure.
*
* The crypto_generichash_state structure is packed and its length is
* either 357 or 361 bytes. For this reason, when using sodium_malloc() to
* allocate a crypto_generichash_state structure, padding must be added in
* order to ensure proper alignment. crypto_generichash_statebytes()
* returns the rounded-up structure size, and should be preferred to sizeof():
* state = sodium_malloc(crypto_generichash_statebytes());
*/
SODIUM_EXPORT
void *sodium_malloc(const size_t size)
__attribute__ ((malloc));
SODIUM_EXPORT
void *sodium_allocarray(size_t count, size_t size)
__attribute__ ((malloc));
SODIUM_EXPORT
void sodium_free(void *ptr);
SODIUM_EXPORT
int sodium_mprotect_noaccess(void *ptr);
SODIUM_EXPORT
int sodium_mprotect_readonly(void *ptr);
SODIUM_EXPORT
int sodium_mprotect_readwrite(void *ptr);
SODIUM_EXPORT
int sodium_pad(size_t *padded_buflen_p, unsigned char *buf,
size_t unpadded_buflen, size_t blocksize, size_t max_buflen);
SODIUM_EXPORT
int sodium_unpad(size_t *unpadded_buflen_p, const unsigned char *buf,
size_t padded_buflen, size_t blocksize);
/* -------- */
int _sodium_alloc_init(void);
#define sodium_base64_ENCODED_LEN(BIN_LEN, VARIANT) \
(((BIN_LEN) / 3U) * 4U \
+ ((((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) \
| (((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) >> 1)) \
& 1U) \
* (4U \
- (~((((VARIANT)&2U) >> 1) - 1U) \
& (3U - ((BIN_LEN) - ((BIN_LEN) / 3U) * 3U)))) \
+ 1U)
SODIUM_EXPORT
size_t
sodium_base64_encoded_len(const size_t bin_len, const int variant);
SODIUM_EXPORT
char *
sodium_bin2base64(char *const b64, const size_t b64_maxlen,
const unsigned char *const bin, const size_t bin_len,
const int variant);
SODIUM_EXPORT
int
sodium_base642bin(unsigned char *const bin, const size_t bin_maxlen,
const char *const b64, const size_t b64_len,
const char *const ignore, size_t *const bin_len,
const char **const b64_end, const int variant);
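/* Worked example for the declarations above (not part of this diff): with
 * BIN_LEN = 5 and the ORIGINAL (padded) variant, the macro evaluates to
 * (5 / 3) * 4 + 4 + 1 = 9, i.e. room for the 8 base64 characters plus the
 * trailing \0. A typical encode call sizes its buffer with the same macro: */
static void
bin2base64_sketch(const unsigned char bin[32])
{
    char b64[sodium_base64_ENCODED_LEN(32, sodium_base64_VARIANT_ORIGINAL)];

    sodium_bin2base64(b64, sizeof b64, bin, 32, sodium_base64_VARIANT_ORIGINAL);
}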
SODIUM_EXPORT
int
sodium_mlock(void *const addr, const size_t len);
SODIUM_EXPORT
int
sodium_munlock(void *const addr, const size_t len);
/* WARNING: sodium_malloc() and sodium_allocarray() are not general-purpose
* allocation functions.
*
* They return a pointer to a region filled with 0xd0 bytes, immediately
* followed by a guard page.
* As a result, accessing a single byte after the requested allocation size
* will intentionally trigger a segmentation fault.
*
* A canary and an additional guard page placed before the beginning of the
* region may also kill the process if a buffer underflow is detected.
*
* The memory layout is:
* [unprotected region size (read only)][guard page (no access)]
* [unprotected pages (read/write)][guard page (no access)]
* With the layout of the unprotected pages being:
* [optional padding][16-byte canary][user region]
*
* However:
* - These functions are significantly slower than standard functions
* - Each allocation requires 3 or 4 additional pages
* - The returned address will not be aligned if the allocation size is not
* a multiple of the required alignment. For this reason, these functions
* are designed to store data, such as secret keys and messages.
*
* sodium_malloc() can be used to allocate any libsodium data structure.
*
* The crypto_generichash_state structure is packed and its length is
* either 357 or 361 bytes. For this reason, when using sodium_malloc() to
* allocate a crypto_generichash_state structure, padding must be added in
* order to ensure proper alignment. crypto_generichash_statebytes()
* returns the rounded-up structure size, and should be preferred to sizeof():
* state = sodium_malloc(crypto_generichash_statebytes());
*/
SODIUM_EXPORT
void *
sodium_malloc(const size_t size) __attribute__((malloc));
SODIUM_EXPORT
void *
sodium_allocarray(size_t count, size_t size) __attribute__((malloc));
SODIUM_EXPORT
void
sodium_free(void *ptr);
SODIUM_EXPORT
int
sodium_mprotect_noaccess(void *ptr);
SODIUM_EXPORT
int
sodium_mprotect_readonly(void *ptr);
SODIUM_EXPORT
int
sodium_mprotect_readwrite(void *ptr);
SODIUM_EXPORT
int
sodium_pad(size_t *padded_buflen_p, unsigned char *buf,
size_t unpadded_buflen, size_t blocksize, size_t max_buflen);
SODIUM_EXPORT
int
sodium_unpad(size_t *unpadded_buflen_p, const unsigned char *buf,
size_t padded_buflen, size_t blocksize);
/* -------- */
int
_sodium_alloc_init(void);
#ifdef __cplusplus
}
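/* Editor's sketch (not part of this diff): a minimal use of the guarded
 * allocation API documented in the comment above, formatted in the style this
 * change introduces. It only uses functions declared in this header
 * (sodium_init, sodium_malloc, sodium_mprotect_*, sodium_free) together with
 * crypto_generichash_statebytes() from the example in that comment; the error
 * handling is illustrative. */
#include <sodium.h>

int
main(void)
{
  crypto_generichash_state *state;

  if(sodium_init() < 0)
    return 1;
  /* Use the rounded-up size, as the comment recommends, instead of sizeof. */
  state = sodium_malloc(crypto_generichash_statebytes());
  if(state == NULL)
    return 1;
  /* ... hash something with the state, then lock it away while idle ... */
  sodium_mprotect_noaccess(state);
  /* Any access to *state here would trap on the protected pages. */
  sodium_mprotect_readwrite(state);
  /* sodium_free() verifies the canary and wipes the region before unmapping. */
  sodium_free(state);
  return 0;
}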

@ -4,424 +4,463 @@
typedef crypto_int32 int32;
static inline void minmax(int32 *x,int32 *y)
static inline void
minmax(int32 *x, int32 *y)
{
asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg %%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
: : "r"(x),"r"(y) : "%eax","%ebx","%edx");
asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg "
"%%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
:
: "r"(x), "r"(y)
: "%eax", "%ebx", "%edx");
}
/* sort x0,x2; sort x1,x3; ... sort x13, x15 */
static inline void minmax02through1315(int32 *x)
static inline void
minmax02through1315(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi64(a,b); /* a01b01a45b45 */
__m256i d = _mm256_unpackhi_epi64(a,b); /* a23b23a67b67 */
__m256i g = _mm256_min_epi32(c,d);
__m256i h = _mm256_max_epi32(c,d);
a = _mm256_unpacklo_epi64(g,h);
b = _mm256_unpackhi_epi64(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_loadu_si256((__m256i *)(x + 8));
__m256i c = _mm256_unpacklo_epi64(a, b); /* a01b01a45b45 */
__m256i d = _mm256_unpackhi_epi64(a, b); /* a23b23a67b67 */
__m256i g = _mm256_min_epi32(c, d);
__m256i h = _mm256_max_epi32(c, d);
a = _mm256_unpacklo_epi64(g, h);
b = _mm256_unpackhi_epi64(g, h);
_mm256_storeu_si256((__m256i *)x, a);
_mm256_storeu_si256((__m256i *)(x + 8), b);
}
/* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */
static inline void minmax02134657(int32 *x)
static inline void
minmax02134657(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0x4e);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0x44);
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_shuffle_epi32(a, 0x4e);
__m256i c = _mm256_cmpgt_epi32(a, b);
c = _mm256_shuffle_epi32(c, 0x44);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *)x, a);
}
static void multiminmax2plus2(
int32 *x,
int n)
static void
multiminmax2plus2(int32 *x, int n)
{
while (n >= 16) {
while(n >= 16)
{
minmax02through1315(x);
n -= 16;
x += 16;
}
if (n >= 8) {
if(n >= 8)
{
minmax02134657(x);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 2);
minmax(x + 1,x + 3);
if(n >= 4)
{
minmax(x, x + 2);
minmax(x + 1, x + 3);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 2);
if (n > 1) minmax(x + 1,x + 3);
if(n > 0)
{
minmax(x, x + 2);
if(n > 1)
minmax(x + 1, x + 3);
}
}
static void multiminmax2plus6(
int32 *x,
int n)
static void
multiminmax2plus6(int32 *x, int n)
{
while (n >= 4) {
minmax(x,x + 6);
minmax(x + 1,x + 7);
while(n >= 4)
{
minmax(x, x + 6);
minmax(x + 1, x + 7);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 6);
if (n > 1) minmax(x + 1,x + 7);
if(n > 0)
{
minmax(x, x + 6);
if(n > 1)
minmax(x + 1, x + 7);
}
}
static void multiminmax2plus14(
int32 *x,
int n)
static void
multiminmax2plus14(int32 *x, int n)
{
while (n >= 8) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
minmax(x + 4,x + 18);
minmax(x + 5,x + 19);
while(n >= 8)
{
minmax(x, x + 14);
minmax(x + 1, x + 15);
minmax(x + 4, x + 18);
minmax(x + 5, x + 19);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 14);
minmax(x + 1,x + 15);
if(n >= 4)
{
minmax(x, x + 14);
minmax(x + 1, x + 15);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + 14);
if (n > 1) minmax(x + 1,x + 15);
if(n > 0)
{
minmax(x, x + 14);
if(n > 1)
minmax(x + 1, x + 15);
}
}
/* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax0145891213(int32 *x,int32 *y)
static inline void
minmax0145891213(int32 *x, int32 *y)
{
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a01234567 = _mm256_loadu_si256((__m256i *)x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *)(x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *)y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *)(y + 8));
__m256i a0189451213 = _mm256_unpacklo_epi64(a01234567,a89101112131415);
__m256i b0189451213 = _mm256_unpacklo_epi64(b01234567,b89101112131415);
__m256i c0189451213 = _mm256_min_epi32(a0189451213,b0189451213);
__m256i d0189451213 = _mm256_max_epi32(a0189451213,b0189451213);
__m256i a0189451213 = _mm256_unpacklo_epi64(a01234567, a89101112131415);
__m256i b0189451213 = _mm256_unpacklo_epi64(b01234567, b89101112131415);
__m256i c0189451213 = _mm256_min_epi32(a0189451213, b0189451213);
__m256i d0189451213 = _mm256_max_epi32(a0189451213, b0189451213);
__m256i c01234567 = _mm256_blend_epi32(a01234567,c0189451213,0x33);
__m256i d01234567 = _mm256_blend_epi32(b01234567,d0189451213,0x33);
__m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213,a89101112131415);
__m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213,b89101112131415);
__m256i c01234567 = _mm256_blend_epi32(a01234567, c0189451213, 0x33);
__m256i d01234567 = _mm256_blend_epi32(b01234567, d0189451213, 0x33);
__m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213, a89101112131415);
__m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213, b89101112131415);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
_mm256_storeu_si256((__m256i *)x, c01234567);
_mm256_storeu_si256((__m256i *)(x + 8), c89101112131415);
_mm256_storeu_si256((__m256i *)y, d01234567);
_mm256_storeu_si256((__m256i *)(y + 8), d89101112131415);
}
/* offset >= 30 */
static void multiminmax2plusmore(
int32 *x,
int n,
int offset)
static void
multiminmax2plusmore(int32 *x, int n, int offset)
{
while (n >= 16) {
minmax0145891213(x,x + offset);
while(n >= 16)
{
minmax0145891213(x, x + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 5,x + 5 + offset);
if(n >= 8)
{
minmax(x, x + offset);
minmax(x + 1, x + 1 + offset);
minmax(x + 4, x + 4 + offset);
minmax(x + 5, x + 5 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 1,x + 1 + offset);
if(n >= 4)
{
minmax(x, x + offset);
minmax(x + 1, x + 1 + offset);
n -= 4;
x += 4;
}
if (n > 0) {
minmax(x,x + offset);
if (n > 1) minmax(x + 1,x + 1 + offset);
if(n > 0)
{
minmax(x, x + offset);
if(n > 1)
minmax(x + 1, x + 1 + offset);
}
}
/* sort x0,x1; ... sort x14, x15 */
static inline void minmax01through1415(int32 *x)
static inline void
minmax01through1415(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i c = _mm256_unpacklo_epi32(a,b); /* ab0ab1ab4ab5 */
__m256i d = _mm256_unpackhi_epi32(a,b); /* ab2ab3ab6ab7 */
__m256i e = _mm256_unpacklo_epi32(c,d); /* a02b02a46b46 */
__m256i f = _mm256_unpackhi_epi32(c,d); /* a13b13a57b57 */
__m256i g = _mm256_min_epi32(e,f); /* a02b02a46b46 */
__m256i h = _mm256_max_epi32(e,f); /* a13b13a57b57 */
a = _mm256_unpacklo_epi32(g,h);
b = _mm256_unpackhi_epi32(g,h);
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *) (x + 8),b);
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_loadu_si256((__m256i *)(x + 8));
__m256i c = _mm256_unpacklo_epi32(a, b); /* ab0ab1ab4ab5 */
__m256i d = _mm256_unpackhi_epi32(a, b); /* ab2ab3ab6ab7 */
__m256i e = _mm256_unpacklo_epi32(c, d); /* a02b02a46b46 */
__m256i f = _mm256_unpackhi_epi32(c, d); /* a13b13a57b57 */
__m256i g = _mm256_min_epi32(e, f); /* a02b02a46b46 */
__m256i h = _mm256_max_epi32(e, f); /* a13b13a57b57 */
a = _mm256_unpacklo_epi32(g, h);
b = _mm256_unpackhi_epi32(g, h);
_mm256_storeu_si256((__m256i *)x, a);
_mm256_storeu_si256((__m256i *)(x + 8), b);
}
/* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */
static inline void minmax01234567(int32 *x)
static inline void
minmax01234567(int32 *x)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_shuffle_epi32(a,0xb1);
__m256i c = _mm256_cmpgt_epi32(a,b);
c = _mm256_shuffle_epi32(c,0xa0);
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_shuffle_epi32(a, 0xb1);
__m256i c = _mm256_cmpgt_epi32(a, b);
c = _mm256_shuffle_epi32(c, 0xa0);
__m256i abc = c & (a ^ b);
a ^= abc;
_mm256_storeu_si256((__m256i *) x,a);
_mm256_storeu_si256((__m256i *)x, a);
}
static void multiminmax1plus1(
int32 *x,
int n)
static void
multiminmax1plus1(int32 *x, int n)
{
while (n >= 16) {
while(n >= 16)
{
minmax01through1415(x);
n -= 16;
x += 16;
}
if (n >= 8) {
if(n >= 8)
{
minmax01234567(x);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + 1);
minmax(x + 2,x + 3);
if(n >= 4)
{
minmax(x, x + 1);
minmax(x + 2, x + 3);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + 1);
if(n >= 2)
{
minmax(x, x + 1);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + 1);
if(n > 0)
minmax(x, x + 1);
}
static void multiminmax1(
int32 *x,
int n,
int offset)
static void
multiminmax1(int32 *x, int n, int offset)
{
while (n >= 16) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
minmax(x + 8,x + 8 + offset);
minmax(x + 10,x + 10 + offset);
minmax(x + 12,x + 12 + offset);
minmax(x + 14,x + 14 + offset);
while(n >= 16)
{
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
minmax(x + 4, x + 4 + offset);
minmax(x + 6, x + 6 + offset);
minmax(x + 8, x + 8 + offset);
minmax(x + 10, x + 10 + offset);
minmax(x + 12, x + 12 + offset);
minmax(x + 14, x + 14 + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
if(n >= 8)
{
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
minmax(x + 4, x + 4 + offset);
minmax(x + 6, x + 6 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
if(n >= 4)
{
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + offset);
if(n >= 2)
{
minmax(x, x + offset);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + offset);
if(n > 0)
minmax(x, x + offset);
}
/* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */
/* all of x0...x15 and y0...y15 must exist; no aliasing */
static inline void minmax02468101214(int32 *x,int32 *y)
static inline void
minmax02468101214(int32 *x, int32 *y)
{
__m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
__m256i a01234567 = _mm256_loadu_si256((__m256i *)x);
__m256i a89101112131415 = _mm256_loadu_si256((__m256i *)(x + 8));
__m256i b01234567 = _mm256_loadu_si256((__m256i *)y);
__m256i b89101112131415 = _mm256_loadu_si256((__m256i *)(y + 8));
__m256i a0819412513 = _mm256_unpacklo_epi32(a01234567,a89101112131415);
__m256i a210311614715 = _mm256_unpackhi_epi32(a01234567,a89101112131415);
__m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513,a210311614715);
__m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513,a210311614715);
__m256i a0819412513 = _mm256_unpacklo_epi32(a01234567, a89101112131415);
__m256i a210311614715 = _mm256_unpackhi_epi32(a01234567, a89101112131415);
__m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513, a210311614715);
__m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513, a210311614715);
__m256i b0819412513 = _mm256_unpacklo_epi32(b01234567,b89101112131415);
__m256i b210311614715 = _mm256_unpackhi_epi32(b01234567,b89101112131415);
__m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513,b210311614715);
__m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513,b210311614715);
__m256i b0819412513 = _mm256_unpacklo_epi32(b01234567, b89101112131415);
__m256i b210311614715 = _mm256_unpackhi_epi32(b01234567, b89101112131415);
__m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513, b210311614715);
__m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513, b210311614715);
__m256i c02810461214 = _mm256_min_epi32(a02810461214,b02810461214);
__m256i d02810461214 = _mm256_max_epi32(a02810461214,b02810461214);
__m256i c02810461214 = _mm256_min_epi32(a02810461214, b02810461214);
__m256i d02810461214 = _mm256_max_epi32(a02810461214, b02810461214);
__m256i c01234567 = _mm256_unpacklo_epi32(c02810461214,a13911571315);
__m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214,a13911571315);
__m256i d01234567 = _mm256_unpacklo_epi32(d02810461214,b13911571315);
__m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214,b13911571315);
_mm256_storeu_si256((__m256i *) x,c01234567);
_mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
_mm256_storeu_si256((__m256i *) y,d01234567);
_mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
__m256i c01234567 = _mm256_unpacklo_epi32(c02810461214, a13911571315);
__m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214, a13911571315);
__m256i d01234567 = _mm256_unpacklo_epi32(d02810461214, b13911571315);
__m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214, b13911571315);
_mm256_storeu_si256((__m256i *)x, c01234567);
_mm256_storeu_si256((__m256i *)(x + 8), c89101112131415);
_mm256_storeu_si256((__m256i *)y, d01234567);
_mm256_storeu_si256((__m256i *)(y + 8), d89101112131415);
}
/* assumes offset >= 31 */
static void multiminmax1plusmore(
int32 *x,
int n,
int offset)
static void
multiminmax1plusmore(int32 *x, int n, int offset)
{
while (n >= 16) {
minmax02468101214(x,x + offset);
while(n >= 16)
{
minmax02468101214(x, x + offset);
n -= 16;
x += 16;
}
if (n >= 8) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
minmax(x + 4,x + 4 + offset);
minmax(x + 6,x + 6 + offset);
if(n >= 8)
{
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
minmax(x + 4, x + 4 + offset);
minmax(x + 6, x + 6 + offset);
n -= 8;
x += 8;
}
if (n >= 4) {
minmax(x,x + offset);
minmax(x + 2,x + 2 + offset);
if(n >= 4)
{
minmax(x, x + offset);
minmax(x + 2, x + 2 + offset);
n -= 4;
x += 4;
}
if (n >= 2) {
minmax(x,x + offset);
if(n >= 2)
{
minmax(x, x + offset);
n -= 2;
x += 2;
}
if (n > 0)
minmax(x,x + offset);
if(n > 0)
minmax(x, x + offset);
}
/* sort x0,y0; sort x1,y1; ...; sort x7,y7 */
static inline void minmax8(int32 *x,int32 *y)
static inline void
minmax8(int32 *x, int32 *y)
{
__m256i a = _mm256_loadu_si256((__m256i *) x);
__m256i b = _mm256_loadu_si256((__m256i *) y);
_mm256_storeu_si256((__m256i *) x,_mm256_min_epi32(a,b));
_mm256_storeu_si256((__m256i *) y,_mm256_max_epi32(a,b));
__m256i a = _mm256_loadu_si256((__m256i *)x);
__m256i b = _mm256_loadu_si256((__m256i *)y);
_mm256_storeu_si256((__m256i *)x, _mm256_min_epi32(a, b));
_mm256_storeu_si256((__m256i *)y, _mm256_max_epi32(a, b));
}
/* assumes p >= 8; implies offset >= 8 */
static void multiminmax_atleast8(int p,
int32 *x,
int n,
int offset)
static void
multiminmax_atleast8(int p, int32 *x, int n, int offset)
{
int i;
while (n >= 2 * p) {
for (i = 0;i < p;i += 8)
minmax8(x + i,x + i + offset);
while(n >= 2 * p)
{
for(i = 0; i < p; i += 8)
minmax8(x + i, x + i + offset);
n -= 2 * p;
x += 2 * p;
}
for (i = 0;i + 8 <= n;i += 8) {
if (i & p) return;
minmax8(x + i,x + i + offset);
for(i = 0; i + 8 <= n; i += 8)
{
if(i & p)
return;
minmax8(x + i, x + i + offset);
}
for (;i < n;++i) {
if (i & p) return;
minmax(x + i,x + i + offset);
for(; i < n; ++i)
{
if(i & p)
return;
minmax(x + i, x + i + offset);
}
}
/* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */
static inline void minmax4(int32 *x,int32 *y)
static inline void
minmax4(int32 *x, int32 *y)
{
__m128i a = _mm_loadu_si128((__m128i *) x);
__m128i b = _mm_loadu_si128((__m128i *) y);
_mm_storeu_si128((__m128i *) x,_mm_min_epi32(a,b));
_mm_storeu_si128((__m128i *) y,_mm_max_epi32(a,b));
__m128i a = _mm_loadu_si128((__m128i *)x);
__m128i b = _mm_loadu_si128((__m128i *)y);
_mm_storeu_si128((__m128i *)x, _mm_min_epi32(a, b));
_mm_storeu_si128((__m128i *)y, _mm_max_epi32(a, b));
}
static void multiminmax4(
int32 *x,
int n,
int offset)
static void
multiminmax4(int32 *x, int n, int offset)
{
int i;
while (n >= 8) {
minmax4(x,x + offset);
while(n >= 8)
{
minmax4(x, x + offset);
n -= 8;
x += 8;
}
if (n >= 4)
minmax4(x,x + offset);
if(n >= 4)
minmax4(x, x + offset);
else
for (i = 0;i < n;++i)
minmax(x + i,x + i + offset);
for(i = 0; i < n; ++i)
minmax(x + i, x + i + offset);
}
void int32_sort(int32 *x,int n)
void
int32_sort(int32 *x, int n)
{
int top,p,q;
int top, p, q;
if (n < 2) return;
if(n < 2)
return;
top = 1;
while (top < n - top) top += top;
while(top < n - top)
top += top;
for (p = top;p >= 8;p >>= 1) {
multiminmax_atleast8(p,x,n - p,p);
for (q = top;q > p;q >>= 1)
multiminmax_atleast8(p,x + p,n - q,q - p);
}
if (p >= 4) {
multiminmax4(x,n - 4,4);
for (q = top;q > 4;q >>= 1)
multiminmax4(x + 4,n - q,q - 4);
}
if (p >= 2) {
multiminmax2plus2(x,n - 2);
for (q = top;q >= 32;q >>= 1)
multiminmax2plusmore(x + 2,n - q,q - 2);
if (q >= 16)
multiminmax2plus14(x + 2,n - 16);
if (q >= 8)
multiminmax2plus6(x + 2,n - 8);
if (q >= 4)
multiminmax2plus2(x + 2,n - 4);
}
multiminmax1plus1(x,n - 1);
for (q = top;q >= 32;q >>= 1)
multiminmax1plusmore(x + 1,n - q,q - 1);
if (q >= 16)
multiminmax1(x + 1,n - 16,15);
if (q >= 8)
multiminmax1(x + 1,n - 8,7);
if (q >= 4)
multiminmax1(x + 1,n - 4,3);
if (q >= 2)
multiminmax1plus1(x + 1,n - 2);
for(p = top; p >= 8; p >>= 1)
{
multiminmax_atleast8(p, x, n - p, p);
for(q = top; q > p; q >>= 1)
multiminmax_atleast8(p, x + p, n - q, q - p);
}
if(p >= 4)
{
multiminmax4(x, n - 4, 4);
for(q = top; q > 4; q >>= 1)
multiminmax4(x + 4, n - q, q - 4);
}
if(p >= 2)
{
multiminmax2plus2(x, n - 2);
for(q = top; q >= 32; q >>= 1)
multiminmax2plusmore(x + 2, n - q, q - 2);
if(q >= 16)
multiminmax2plus14(x + 2, n - 16);
if(q >= 8)
multiminmax2plus6(x + 2, n - 8);
if(q >= 4)
multiminmax2plus2(x + 2, n - 4);
}
multiminmax1plus1(x, n - 1);
for(q = top; q >= 32; q >>= 1)
multiminmax1plusmore(x + 1, n - q, q - 1);
if(q >= 16)
multiminmax1(x + 1, n - 16, 15);
if(q >= 8)
multiminmax1(x + 1, n - 8, 7);
if(q >= 4)
multiminmax1(x + 1, n - 4, 3);
if(q >= 2)
multiminmax1plus1(x + 1, n - 2);
}
#endif

@ -4,12 +4,15 @@
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_avx_r3_mult
extern void r3_mult(small *,const small *,const small *);
extern void
r3_mult(small *, const small *, const small *);
#define r3_recip crypto_kem_sntrup4591761_avx_r3_recip
extern int r3_recip(small *,const small *);
extern int
r3_recip(small *, const small *);
#define r3_weightw_mask crypto_kem_sntrup4591761_avx_r3_weightw_mask
extern int r3_weightw_mask(const small *);
extern int
r3_weightw_mask(const small *);
#endif

@ -6,91 +6,102 @@
#include "r3.h"
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
static int
smaller_mask(int x, int y)
{
return (x - y) >> 31;
}
static void vectormod3_product(small *z,int len,const small *x,const small c)
static void
vectormod3_product(small *z, int len, const small *x, const small c)
{
int i;
int minusmask = c;
int plusmask = -c;
int plusmask = -c;
__m256i minusvec, plusvec, zerovec;
minusmask >>= 31;
plusmask >>= 31;
minusvec = _mm256_set1_epi32(minusmask);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
while (len >= 32) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec,xi) & minusvec);
_mm256_storeu_si256((__m256i *) z,xi);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
while(len >= 32)
{
__m256i xi = _mm256_loadu_si256((__m256i *)x);
xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec, xi) & minusvec);
_mm256_storeu_si256((__m256i *)z, xi);
x += 32;
z += 32;
len -= 32;
}
for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c);
for(i = 0; i < len; ++i)
z[i] = mod3_product(x[i], c);
}
static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c)
static void
vectormod3_minusproduct(small *z, int len, const small *x, const small *y,
const small c)
{
int i;
int minusmask = c;
int plusmask = -c;
int plusmask = -c;
__m256i minusvec, plusvec, zerovec, twovec, fourvec;
minusmask >>= 31;
plusmask >>= 31;
minusvec = _mm256_set1_epi32(minusmask);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
twovec = _mm256_set1_epi32(0x02020202);
fourvec = _mm256_set1_epi32(0x04040404);
while (len >= 32) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
__m256i yi = _mm256_loadu_si256((__m256i *) y);
plusvec = _mm256_set1_epi32(plusmask);
zerovec = _mm256_set1_epi32(0);
twovec = _mm256_set1_epi32(0x02020202);
fourvec = _mm256_set1_epi32(0x04040404);
while(len >= 32)
{
__m256i xi = _mm256_loadu_si256((__m256i *)x);
__m256i yi = _mm256_loadu_si256((__m256i *)y);
__m256i r;
yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec,yi) & minusvec);
xi = _mm256_sub_epi8(xi,yi);
yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec, yi) & minusvec);
xi = _mm256_sub_epi8(xi, yi);
r = _mm256_add_epi8(xi,twovec);
r = _mm256_add_epi8(xi, twovec);
r &= fourvec;
r = _mm256_srli_epi32(r,2);
xi = _mm256_sub_epi8(xi,r);
r = _mm256_add_epi8(r,r);
xi = _mm256_sub_epi8(xi,r);
r = _mm256_srli_epi32(r, 2);
xi = _mm256_sub_epi8(xi, r);
r = _mm256_add_epi8(r, r);
xi = _mm256_sub_epi8(xi, r);
r = _mm256_sub_epi8(twovec,xi);
r = _mm256_sub_epi8(twovec, xi);
r &= fourvec;
r = _mm256_srli_epi32(r,2);
xi = _mm256_add_epi8(xi,r);
r = _mm256_add_epi8(r,r);
xi = _mm256_add_epi8(xi,r);
r = _mm256_srli_epi32(r, 2);
xi = _mm256_add_epi8(xi, r);
r = _mm256_add_epi8(r, r);
xi = _mm256_add_epi8(xi, r);
_mm256_storeu_si256((__m256i *) z,xi);
_mm256_storeu_si256((__m256i *)z, xi);
x += 32;
y += 32;
z += 32;
len -= 32;
}
for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c);
for(i = 0; i < len; ++i)
z[i] = mod3_minusproduct(x[i], y[i], c);
}
static void vectormod3_shift(small *z,int len)
static void
vectormod3_shift(small *z, int len)
{
int i;
while (len >= 33) {
__m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 33));
_mm256_storeu_si256((__m256i *) (z + len - 32),zi);
while(len >= 33)
{
__m256i zi = _mm256_loadu_si256((__m256i *)(z + len - 33));
_mm256_storeu_si256((__m256i *)(z + len - 32), zi);
len -= 32;
}
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
for(i = len - 1; i > 0; --i)
z[i] = z[i - 1];
z[0] = 0;
}
@ -100,12 +111,13 @@ or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int r3_recip(small *r,const small *s)
int
r3_recip(small *r, const small *s)
{
const int loops = 2*p + 1;
const int loops = 2 * p + 1;
int loop;
small f[768];
small g[769];
small f[768];
small g[769];
small u[1536];
small v[1537];
small c;
@ -114,23 +126,28 @@ int r3_recip(small *r,const small *s)
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
for(i = 2; i < p; ++i)
f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = s[i];
for(i = 0; i < p; ++i)
g[i] = s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
for(i = 0; i <= loops; ++i)
u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
for(i = 1; i <= loops; ++i)
v[i] = 0;
loop = 0;
for (;;) {
for(;;)
{
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
@ -141,29 +158,35 @@ int r3_recip(small *r,const small *s)
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1
* coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
if(loop >= loops)
break;
c = mod3_quotient(g[p],f[p]);
c = mod3_quotient(g[p], f[p]);
vectormod3_minusproduct(g,768,g,f,c);
vectormod3_shift(g,769);
vectormod3_minusproduct(g, 768, g, f, c);
vectormod3_shift(g, 769);
#ifdef SIMPLER
vectormod3_minusproduct(v,1536,v,u,c);
vectormod3_shift(v,1537);
vectormod3_minusproduct(v, 1536, v, u, c);
vectormod3_shift(v, 1537);
#else
if (loop < p) {
vectormod3_minusproduct(v,loop + 1,v,u,c);
vectormod3_shift(v,loop + 2);
} else {
vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormod3_shift(v + loop - p,p + 2);
if(loop < p)
{
vectormod3_minusproduct(v, loop + 1, v, u, c);
vectormod3_shift(v, loop + 2);
}
else
{
vectormod3_minusproduct(v + loop - p, p + 1, v + loop - p, u + loop - p,
c);
vectormod3_shift(v + loop - p, p + 2);
}
#endif
@ -171,24 +194,28 @@ int r3_recip(small *r,const small *s)
++loop;
swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,(p + 1) * sizeof(small),swapmask);
swapmask = smaller_mask(e, d) & mod3_nonzero_mask(g[p]);
swap(&e, &d, sizeof e, swapmask);
swap(f, g, (p + 1) * sizeof(small), swapmask);
#ifdef SIMPLER
swap(u,v,1536 * sizeof(small),swapmask);
swap(u, v, 1536 * sizeof(small), swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(small),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask);
if(loop < p)
{
swap(u, v, (loop + 1) * sizeof(small), swapmask);
}
else
{
swap(u + loop - p, v + loop - p, (p + 1) * sizeof(small), swapmask);
}
#endif
}
c = mod3_reciprocal(f[p]);
vectormod3_product(r,p,u + p,c);
for (i = p;i < 768;++i) r[i] = 0;
return smaller_mask(0,d);
vectormod3_product(r, p, u + p, c);
for(i = p; i < 768; ++i)
r[i] = 0;
return smaller_mask(0, d);
}
#endif

@ -5,27 +5,35 @@
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_avx_rq_encode
extern void rq_encode(unsigned char *,const modq *);
extern void
rq_encode(unsigned char *, const modq *);
#define rq_decode crypto_kem_sntrup4591761_avx_rq_decode
extern void rq_decode(modq *,const unsigned char *);
extern void
rq_decode(modq *, const unsigned char *);
#define rq_roundencode crypto_kem_sntrup4591761_avx_rq_roundencode
extern void rq_roundencode(unsigned char *,const modq *);
extern void
rq_roundencode(unsigned char *, const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_avx_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
extern void
rq_decoderounded(modq *, const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_avx_rq_round3
extern void rq_round3(modq *,const modq *);
extern void
rq_round3(modq *, const modq *);
#define rq_mod3 crypto_kem_sntrup4591761_avx_rq_mod3
extern void rq_mod3(small *,const modq *);
extern void
rq_mod3(small *, const modq *);
#define rq_mult crypto_kem_sntrup4591761_avx_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
extern void
rq_mult(modq *, const modq *, const small *);
#define rq_recip3 crypto_kem_sntrup4591761_avx_rq_recip3
int rq_recip3(modq *,const small *);
int
rq_recip3(modq *, const small *);
#endif

@ -12,47 +12,57 @@
// 32-bit hosts only
#ifndef __amd64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
__a[N];}))
#define _mm_extract_epi64(X, N) \
(__extension__({ \
__v2di __a = (__v2di)(X); \
__a[N]; \
}))
#endif
static inline __m256i squeeze(__m256i x)
static inline __m256i
squeeze(__m256i x)
{
__m256i q = _mm256_mulhrs_epi16(x,v7);
q = _mm256_mullo_epi16(q,v4591_16);
return _mm256_sub_epi16(x,q);
__m256i q = _mm256_mulhrs_epi16(x, v7);
q = _mm256_mullo_epi16(q, v4591_16);
return _mm256_sub_epi16(x, q);
}
static inline __m256i freeze(__m256i x)
static inline __m256i
freeze(__m256i x)
{
__m256i mask, x2296, x4591;
x4591 = _mm256_add_epi16(x,v4591_16);
mask = _mm256_srai_epi16(x,15);
x = _mm256_blendv_epi8(x,x4591,mask);
x2296 = _mm256_sub_epi16(x,v2296_16);
mask = _mm256_srai_epi16(x2296,15);
x4591 = _mm256_sub_epi16(x,v4591_16);
x = _mm256_blendv_epi8(x4591,x,mask);
x4591 = _mm256_add_epi16(x, v4591_16);
mask = _mm256_srai_epi16(x, 15);
x = _mm256_blendv_epi8(x, x4591, mask);
x2296 = _mm256_sub_epi16(x, v2296_16);
mask = _mm256_srai_epi16(x2296, 15);
x4591 = _mm256_sub_epi16(x, v4591_16);
x = _mm256_blendv_epi8(x4591, x, mask);
return x;
}
void rq_mod3(small *g,const modq *f)
void
rq_mod3(small *g, const modq *f)
{
int i;
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
for(i = 0; i < 768; i += 16)
{
__m256i x = _mm256_loadu_si256((__m256i *)&f[i]);
__m256i q;
x = _mm256_mullo_epi16(x,v3);
x = _mm256_mullo_epi16(x, v3);
x = squeeze(x);
x = freeze(x);
q = _mm256_mulhrs_epi16(x,v10923_16);
x = _mm256_sub_epi16(x,q);
q = _mm256_add_epi16(q,q);
x = _mm256_sub_epi16(x,q); /* g0 g1 ... g15 */
x = _mm256_packs_epi16(x,x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
0[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,0),0);
1[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,1),0);
q = _mm256_mulhrs_epi16(x, v10923_16);
x = _mm256_sub_epi16(x, q);
q = _mm256_add_epi16(q, q);
x = _mm256_sub_epi16(x, q); /* g0 g1 ... g15 */
x = _mm256_packs_epi16(x,
x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
0 [(long long *)&g[i]] =
_mm_extract_epi64(_mm256_extracti128_si256(x, 0), 0);
1 [(long long *)&g[i]] =
_mm_extract_epi64(_mm256_extracti128_si256(x, 1), 0);
}
}
#endif

@ -10,93 +10,103 @@
#define v29234_16 _mm256_set1_epi16(29234)
/* caller must ensure that x-y does not overflow */
static int smaller_mask(int x,int y)
static int
smaller_mask(int x, int y)
{
return (x - y) >> 31;
}
static inline __m256i product(__m256i x,__m256i y)
static inline __m256i
product(__m256i x, __m256i y)
{
__m256i lo, hi, r0, r1, t0, t1, t, s0, s1;
lo = _mm256_mullo_epi16(x,y);
hi = _mm256_mulhi_epi16(x,y);
r0 = _mm256_unpacklo_epi16(lo,hi);
r1 = _mm256_unpackhi_epi16(lo,hi);
t0 = _mm256_srai_epi32(r0,16);
t1 = _mm256_srai_epi32(r1,16);
t = _mm256_packs_epi32(t0,t1);
t = _mm256_mulhrs_epi16(t,v29234_16);
lo = _mm256_mullo_epi16(t,v4591_16);
hi = _mm256_mulhi_epi16(t,v4591_16);
s0 = _mm256_unpacklo_epi16(lo,hi);
s1 = _mm256_unpackhi_epi16(lo,hi);
s0 = _mm256_slli_epi32(s0,4);
s1 = _mm256_slli_epi32(s1,4);
r0 = _mm256_sub_epi32(r0,s0);
r1 = _mm256_sub_epi32(r1,s1);
t0 = _mm256_srai_epi32(r0,8);
t1 = _mm256_srai_epi32(r1,8);
t = _mm256_packs_epi32(t0,t1);
t = _mm256_mulhrs_epi16(t,v1827_16);
lo = _mm256_mullo_epi16(t,v4591_16);
hi = _mm256_mulhi_epi16(t,v4591_16);
s0 = _mm256_unpacklo_epi16(lo,hi);
s1 = _mm256_unpackhi_epi16(lo,hi);
r0 = _mm256_sub_epi32(r0,s0);
r1 = _mm256_sub_epi32(r1,s1);
x = _mm256_packs_epi32(r0,r1);
lo = _mm256_mullo_epi16(x, y);
hi = _mm256_mulhi_epi16(x, y);
r0 = _mm256_unpacklo_epi16(lo, hi);
r1 = _mm256_unpackhi_epi16(lo, hi);
t0 = _mm256_srai_epi32(r0, 16);
t1 = _mm256_srai_epi32(r1, 16);
t = _mm256_packs_epi32(t0, t1);
t = _mm256_mulhrs_epi16(t, v29234_16);
lo = _mm256_mullo_epi16(t, v4591_16);
hi = _mm256_mulhi_epi16(t, v4591_16);
s0 = _mm256_unpacklo_epi16(lo, hi);
s1 = _mm256_unpackhi_epi16(lo, hi);
s0 = _mm256_slli_epi32(s0, 4);
s1 = _mm256_slli_epi32(s1, 4);
r0 = _mm256_sub_epi32(r0, s0);
r1 = _mm256_sub_epi32(r1, s1);
t0 = _mm256_srai_epi32(r0, 8);
t1 = _mm256_srai_epi32(r1, 8);
t = _mm256_packs_epi32(t0, t1);
t = _mm256_mulhrs_epi16(t, v1827_16);
lo = _mm256_mullo_epi16(t, v4591_16);
hi = _mm256_mulhi_epi16(t, v4591_16);
s0 = _mm256_unpacklo_epi16(lo, hi);
s1 = _mm256_unpackhi_epi16(lo, hi);
r0 = _mm256_sub_epi32(r0, s0);
r1 = _mm256_sub_epi32(r1, s1);
x = _mm256_packs_epi32(r0, r1);
return x;
}
static inline __m256i minusproduct(__m256i x,__m256i y,__m256i z)
static inline __m256i
minusproduct(__m256i x, __m256i y, __m256i z)
{
__m256i t;
x = _mm256_sub_epi16(x,product(y,z));
t = _mm256_mulhrs_epi16(x,v7);
t = _mm256_mullo_epi16(t,v4591_16);
x = _mm256_sub_epi16(x,t);
x = _mm256_sub_epi16(x, product(y, z));
t = _mm256_mulhrs_epi16(x, v7);
t = _mm256_mullo_epi16(t, v4591_16);
x = _mm256_sub_epi16(x, t);
return x;
}
static void vectormodq_product(modq *z,int len,const modq *x,const modq c)
static void
vectormodq_product(modq *z, int len, const modq *x, const modq c)
{
__m256i cvec = _mm256_set1_epi16(c);
while (len >= 16) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
xi = product(xi,cvec);
_mm256_storeu_si256((__m256i *) z,xi);
while(len >= 16)
{
__m256i xi = _mm256_loadu_si256((__m256i *)x);
xi = product(xi, cvec);
_mm256_storeu_si256((__m256i *)z, xi);
x += 16;
z += 16;
len -= 16;
}
while (len > 0) {
*z = modq_product(*x,c);
while(len > 0)
{
*z = modq_product(*x, c);
++x;
++z;
--len;
}
}
static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c)
static void
vectormodq_minusproduct(modq *z, int len, const modq *x, const modq *y,
const modq c)
{
__m256i cvec = _mm256_set1_epi16(c);
while (len >= 16) {
__m256i xi = _mm256_loadu_si256((__m256i *) x);
__m256i yi = _mm256_loadu_si256((__m256i *) y);
xi = minusproduct(xi,yi,cvec);
_mm256_storeu_si256((__m256i *) z,xi);
while(len >= 16)
{
__m256i xi = _mm256_loadu_si256((__m256i *)x);
__m256i yi = _mm256_loadu_si256((__m256i *)y);
xi = minusproduct(xi, yi, cvec);
_mm256_storeu_si256((__m256i *)z, xi);
x += 16;
y += 16;
z += 16;
len -= 16;
}
while (len > 0) {
*z = modq_minusproduct(*x,*y,c);
while(len > 0)
{
*z = modq_minusproduct(*x, *y, c);
++x;
++y;
++z;
@ -104,15 +114,18 @@ static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,
}
}
static void vectormodq_shift(modq *z,int len)
static void
vectormodq_shift(modq *z, int len)
{
int i;
while (len >= 17) {
__m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 17));
_mm256_storeu_si256((__m256i *) (z + len - 16),zi);
while(len >= 17)
{
__m256i zi = _mm256_loadu_si256((__m256i *)(z + len - 17));
_mm256_storeu_si256((__m256i *)(z + len - 16), zi);
len -= 16;
}
for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
for(i = len - 1; i > 0; --i)
z[i] = z[i - 1];
z[0] = 0;
}
@ -122,9 +135,10 @@ or returning -1 if s is not invertible mod m
r,s are polys of degree <p
m is x^p-x-1
*/
int rq_recip3(modq *r,const small *s)
int
rq_recip3(modq *r, const small *s)
{
const int loops = 2*p + 1;
const int loops = 2 * p + 1;
int loop;
modq f[768];
modq g[769];
@ -136,23 +150,28 @@ int rq_recip3(modq *r,const small *s)
int e = p;
int swapmask;
for (i = 2;i < p;++i) f[i] = 0;
for(i = 2; i < p; ++i)
f[i] = 0;
f[0] = -1;
f[1] = -1;
f[p] = 1;
/* generalization: can initialize f to any polynomial m */
/* requirements: m has degree exactly p, nonzero constant coefficient */
for (i = 0;i < p;++i) g[i] = 3 * s[i];
for(i = 0; i < p; ++i)
g[i] = 3 * s[i];
g[p] = 0;
for (i = 0;i <= loops;++i) u[i] = 0;
for(i = 0; i <= loops; ++i)
u[i] = 0;
v[0] = 1;
for (i = 1;i <= loops;++i) v[i] = 0;
for(i = 1; i <= loops; ++i)
v[i] = 0;
loop = 0;
for (;;) {
for(;;)
{
/* e == -1 or d + e + loop <= 2*p */
/* f has degree p: i.e., f[p]!=0 */
@ -163,29 +182,35 @@ int rq_recip3(modq *r,const small *s)
/* u has degree <=loop (so it fits in loop+1 coefficients) */
/* u[i]==0 for i < p-d */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
/* if invertible: u[i]==0 for i < loop-p (so can look at just p+1
* coefficients) */
/* v has degree <=loop (so it fits in loop+1 coefficients) */
/* v[i]==0 for i < p-e */
/* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
if (loop >= loops) break;
if(loop >= loops)
break;
c = modq_quotient(g[p],f[p]);
c = modq_quotient(g[p], f[p]);
vectormodq_minusproduct(g,768,g,f,c);
vectormodq_shift(g,769);
vectormodq_minusproduct(g, 768, g, f, c);
vectormodq_shift(g, 769);
#ifdef SIMPLER
vectormodq_minusproduct(v,1536,v,u,c);
vectormodq_shift(v,1537);
vectormodq_minusproduct(v, 1536, v, u, c);
vectormodq_shift(v, 1537);
#else
if (loop < p) {
vectormodq_minusproduct(v,loop + 1,v,u,c);
vectormodq_shift(v,loop + 2);
} else {
vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
vectormodq_shift(v + loop - p,p + 2);
if(loop < p)
{
vectormodq_minusproduct(v, loop + 1, v, u, c);
vectormodq_shift(v, loop + 2);
}
else
{
vectormodq_minusproduct(v + loop - p, p + 1, v + loop - p, u + loop - p,
c);
vectormodq_shift(v + loop - p, p + 2);
}
#endif
@ -193,25 +218,30 @@ int rq_recip3(modq *r,const small *s)
++loop;
swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]);
swap(&e,&d,sizeof e,swapmask);
swap(f,g,768 * sizeof(modq),swapmask);
swapmask = smaller_mask(e, d) & modq_nonzero_mask(g[p]);
swap(&e, &d, sizeof e, swapmask);
swap(f, g, 768 * sizeof(modq), swapmask);
#ifdef SIMPLER
swap(u,v,1536 * sizeof(modq),swapmask);
swap(u, v, 1536 * sizeof(modq), swapmask);
#else
if (loop < p) {
swap(u,v,(loop + 1) * sizeof(modq),swapmask);
} else {
swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask);
if(loop < p)
{
swap(u, v, (loop + 1) * sizeof(modq), swapmask);
}
else
{
swap(u + loop - p, v + loop - p, (p + 1) * sizeof(modq), swapmask);
}
#endif
}
c = modq_reciprocal(f[p]);
vectormodq_product(r,p,u + p,c);
for (i = 0;i < p;++i) r[i] = modq_freeze(r[i]);
for (i = p;i < 768;++i) r[i] = 0;
return smaller_mask(0,d);
vectormodq_product(r, p, u + p, c);
for(i = 0; i < p; ++i)
r[i] = modq_freeze(r[i]);
for(i = p; i < 768; ++i)
r[i] = 0;
return smaller_mask(0, d);
}
#endif

@ -6,17 +6,19 @@
#define v3_16 _mm256_set1_epi16(3)
#define v10923_16 _mm256_set1_epi16(10923)
void rq_round3(modq *h,const modq *f)
void
rq_round3(modq *h, const modq *f)
{
int i;
for (i = 0;i < 768;i += 16) {
__m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
for(i = 0; i < 768; i += 16)
{
__m256i x = _mm256_loadu_si256((__m256i *)&f[i]);
__m256i x2;
x = _mm256_mulhrs_epi16(x,v10923_16);
x2 = _mm256_add_epi16(x,x);
x = _mm256_add_epi16(x,x2);
_mm256_storeu_si256((__m256i *) &h[i],x);
x = _mm256_mulhrs_epi16(x, v10923_16);
x2 = _mm256_add_epi16(x, x);
x = _mm256_add_epi16(x, x2);
_mm256_storeu_si256((__m256i *)&h[i], x);
}
}
#endif

@ -164,35 +164,40 @@ rq_decoderounded(modq *f, const unsigned char *c)
/* x is f0 + f1*1536 + f2*1536^2 */
/* with each f between 0 and 1530 */
f2 = x
f2 =
x
* _mm256_set1_pd(
0.00000042385525173611114052197733521876177320564238470979034900665283203125);
0.00000042385525173611114052197733521876177320564238470979034900665283203125);
f2 = floor(f2);
x -= f2 * _mm256_set1_pd(2359296.0);
f1 = x
f1 =
x
* _mm256_set1_pd(
0.00065104166666666673894681149903362893383018672466278076171875);
0.00065104166666666673894681149903362893383018672466278076171875);
f1 = floor(f1);
x -= f1 * _mm256_set1_pd(1536.0);
f0 = x;
f2 -= _mm256_set1_pd(1531.0)
f2 -=
_mm256_set1_pd(1531.0)
* floor(
f2
* _mm256_set1_pd(
0.0006531678641410842804659875326933615724556148052215576171875));
f1 -= _mm256_set1_pd(1531.0)
f2
* _mm256_set1_pd(
0.0006531678641410842804659875326933615724556148052215576171875));
f1 -=
_mm256_set1_pd(1531.0)
* floor(
f1
* _mm256_set1_pd(
0.0006531678641410842804659875326933615724556148052215576171875));
f0 -= _mm256_set1_pd(1531.0)
f1
* _mm256_set1_pd(
0.0006531678641410842804659875326933615724556148052215576171875));
f0 -=
_mm256_set1_pd(1531.0)
* floor(
f0
* _mm256_set1_pd(
0.0006531678641410842804659875326933615724556148052215576171875));
f0
* _mm256_set1_pd(
0.0006531678641410842804659875326933615724556148052215576171875));
f2 *= _mm256_set1_pd(3.0);
f2 -= _mm256_set1_pd(2295.0);

@ -2,30 +2,33 @@
#include <immintrin.h>
#include "swap.h"
void swap(void *x,void *y,int bytes,int mask)
void
swap(void *x, void *y, int bytes, int mask)
{
char c = mask;
char c = mask;
__m256i maskvec = _mm256_set1_epi32(mask);
while (bytes >= 32) {
__m256i xi = _mm256_loadu_si256(x);
__m256i yi = _mm256_loadu_si256(y);
__m256i xinew = _mm256_blendv_epi8(xi,yi,maskvec);
__m256i yinew = _mm256_blendv_epi8(yi,xi,maskvec);
_mm256_storeu_si256(x,xinew);
_mm256_storeu_si256(y,yinew);
x = 32 + (char *) x;
y = 32 + (char *) y;
while(bytes >= 32)
{
__m256i xi = _mm256_loadu_si256(x);
__m256i yi = _mm256_loadu_si256(y);
__m256i xinew = _mm256_blendv_epi8(xi, yi, maskvec);
__m256i yinew = _mm256_blendv_epi8(yi, xi, maskvec);
_mm256_storeu_si256(x, xinew);
_mm256_storeu_si256(y, yinew);
x = 32 + (char *)x;
y = 32 + (char *)y;
bytes -= 32;
}
while (bytes > 0) {
char xi = *(char *) x;
char yi = *(char *) y;
char t = c & (xi ^ yi);
while(bytes > 0)
{
char xi = *(char *)x;
char yi = *(char *)y;
char t = c & (xi ^ yi);
xi ^= t;
yi ^= t;
*(char *) x = xi;
*(char *) y = yi;
*(char *)x = xi;
*(char *)y = yi;
++x;
++y;
--bytes;

@ -2,6 +2,7 @@
#define swap_h
#define swap crypto_kem_sntrup4591761_avx_swap
extern void swap(void *,void *,int,int);
extern void
swap(void *, void *, int, int);
#endif

@ -1,7 +1,7 @@
#include <libntrup/ntru.h>
#include <stdbool.h>
#include <stdio.h> // printf
#include <stdio.h> // printf
#if __AVX2__
#include <cpuid.h>

@ -7,7 +7,7 @@
#define qshift 2295
#define p 761
#ifdef _MSC_VER
#define LOOPS 2*p+1
#define LOOPS 2 * p + 1
#endif
#define w 286

@ -4,9 +4,11 @@
#include "small.h"
#define r3_mult crypto_kem_sntrup4591761_ref_r3_mult
extern void r3_mult(small *,const small *,const small *);
extern void
r3_mult(small *, const small *, const small *);
#define r3_recip crypto_kem_sntrup4591761_ref_r3_recip
extern int r3_recip(small *,const small *);
extern int
r3_recip(small *, const small *);
#endif

@ -2,30 +2,34 @@
#include "mod3.h"
#include "r3.h"
void r3_mult(small *h,const small *f,const small *g)
void
r3_mult(small *h, const small *f, const small *g)
{
small fg[p + p - 1];
small result;
int i, j;
for (i = 0;i < p;++i) {
for(i = 0; i < p; ++i)
{
result = 0;
for (j = 0;j <= i;++j)
result = mod3_plusproduct(result,f[j],g[i - j]);
for(j = 0; j <= i; ++j)
result = mod3_plusproduct(result, f[j], g[i - j]);
fg[i] = result;
}
for (i = p;i < p + p - 1;++i) {
for(i = p; i < p + p - 1; ++i)
{
result = 0;
for (j = i - p + 1;j < p;++j)
result = mod3_plusproduct(result,f[j],g[i - j]);
for(j = i - p + 1; j < p; ++j)
result = mod3_plusproduct(result, f[j], g[i - j]);
fg[i] = result;
}
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = mod3_sum(fg[i - p],fg[i]);
fg[i - p + 1] = mod3_sum(fg[i - p + 1],fg[i]);
for(i = p + p - 2; i >= p; --i)
{
fg[i - p] = mod3_sum(fg[i - p], fg[i]);
fg[i - p + 1] = mod3_sum(fg[i - p + 1], fg[i]);
}
for (i = 0;i < p;++i)
for(i = 0; i < p; ++i)
h[i] = fg[i];
}

@ -5,24 +5,31 @@
#include "small.h"
#define rq_encode crypto_kem_sntrup4591761_ref_rq_encode
extern void rq_encode(unsigned char *,const modq *);
extern void
rq_encode(unsigned char *, const modq *);
#define rq_decode crypto_kem_sntrup4591761_ref_rq_decode
extern void rq_decode(modq *,const unsigned char *);
extern void
rq_decode(modq *, const unsigned char *);
#define rq_encoderounded crypto_kem_sntrup4591761_ref_rq_encoderounded
extern void rq_encoderounded(unsigned char *,const modq *);
extern void
rq_encoderounded(unsigned char *, const modq *);
#define rq_decoderounded crypto_kem_sntrup4591761_ref_rq_decoderounded
extern void rq_decoderounded(modq *,const unsigned char *);
extern void
rq_decoderounded(modq *, const unsigned char *);
#define rq_round3 crypto_kem_sntrup4591761_ref_rq_round
extern void rq_round3(modq *,const modq *);
extern void
rq_round3(modq *, const modq *);
#define rq_mult crypto_kem_sntrup4591761_ref_rq_mult
extern void rq_mult(modq *,const modq *,const small *);
extern void
rq_mult(modq *, const modq *, const small *);
#define rq_recip3 crypto_kem_sntrup4591761_ref_rq_recip3
int rq_recip3(modq *,const small *);
int
rq_recip3(modq *, const small *);
#endif

@ -1,30 +1,34 @@
#include "params.h"
#include "rq.h"
void rq_mult(modq *h,const modq *f,const small *g)
void
rq_mult(modq *h, const modq *f, const small *g)
{
modq fg[p + p - 1];
modq result;
int i, j;
for (i = 0;i < p;++i) {
for(i = 0; i < p; ++i)
{
result = 0;
for (j = 0;j <= i;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
for(j = 0; j <= i; ++j)
result = modq_plusproduct(result, f[j], g[i - j]);
fg[i] = result;
}
for (i = p;i < p + p - 1;++i) {
for(i = p; i < p + p - 1; ++i)
{
result = 0;
for (j = i - p + 1;j < p;++j)
result = modq_plusproduct(result,f[j],g[i - j]);
for(j = i - p + 1; j < p; ++j)
result = modq_plusproduct(result, f[j], g[i - j]);
fg[i] = result;
}
for (i = p + p - 2;i >= p;--i) {
fg[i - p] = modq_sum(fg[i - p],fg[i]);
fg[i - p + 1] = modq_sum(fg[i - p + 1],fg[i]);
for(i = p + p - 2; i >= p; --i)
{
fg[i - p] = modq_sum(fg[i - p], fg[i]);
fg[i - p + 1] = modq_sum(fg[i - p + 1], fg[i]);
}
for (i = 0;i < p;++i)
for(i = 0; i < p; ++i)
h[i] = fg[i];
}

@ -1,10 +1,11 @@
#include "params.h"
#include "rq.h"
void rq_round3(modq *h,const modq *f)
void
rq_round3(modq *h, const modq *f)
{
int i;
for (i = 0;i < p;++i)
for(i = 0; i < p; ++i)
h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295;
}

@ -4,34 +4,41 @@
/* XXX: these functions rely on p mod 4 = 1 */
/* all coefficients in -1, 0, 1 */
void small_encode(unsigned char *c,const small *f)
void
small_encode(unsigned char *c, const small *f)
{
small c0;
int i;
for (i = 0;i < p/4;++i) {
for(i = 0; i < p / 4; ++i)
{
c0 = *f++ + 1;
c0 += (*f++ + 1) << 2;
c0 += (*f++ + 1) << 4;
c0 += (*f++ + 1) << 6;
*c++ = c0;
}
c0 = *f++ + 1;
c0 = *f++ + 1;
*c++ = c0;
}
void small_decode(small *f,const unsigned char *c)
void
small_decode(small *f, const unsigned char *c)
{
unsigned char c0;
int i;
for (i = 0;i < p/4;++i) {
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
*f++ = ((small) (c0 & 3)) - 1;
for(i = 0; i < p / 4; ++i)
{
c0 = *c++;
*f++ = ((small)(c0 & 3)) - 1;
c0 >>= 2;
*f++ = ((small)(c0 & 3)) - 1;
c0 >>= 2;
*f++ = ((small)(c0 & 3)) - 1;
c0 >>= 2;
*f++ = ((small)(c0 & 3)) - 1;
}
c0 = *c++;
*f++ = ((small) (c0 & 3)) - 1;
c0 = *c++;
*f++ = ((small)(c0 & 3)) - 1;
}

@ -1,19 +1,21 @@
#include "swap.h"
void swap(void *x,void *y,int bytes,int mask)
void
swap(void *x, void *y, int bytes, int mask)
{
int i;
char xi, yi, c, t;
c = mask;
for (i = 0;i < bytes;++i) {
xi = i[(char *) x];
yi = i[(char *) y];
t = c & (xi ^ yi);
for(i = 0; i < bytes; ++i)
{
xi = i[(char *)x];
yi = i[(char *)y];
t = c & (xi ^ yi);
xi ^= t;
yi ^= t;
i[(char *) x] = xi;
i[(char *) y] = yi;
i[(char *)x] = xi;
i[(char *)y] = yi;
}
}

@ -2,6 +2,7 @@
#define swap_h
#define swap crypto_kem_sntrup4591761_ref_swap
extern void swap(void *,void *,int,int);
extern void
swap(void *, void *, int, int);
#endif

@ -36,7 +36,7 @@ sodium_init(void)
return -1; /* LCOV_EXCL_LINE */
}
/* if we're here, we already started properly */
return initialized ? 0: -1;
return initialized ? 0 : -1;
}
_sodium_runtime_get_cpu_features();
_crypto_generichash_blake2b_pick_best_implementation();

@ -10,116 +10,116 @@ crypto_core_salsa(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c,
const int rounds)
{
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
x15;
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14,
j15;
int i;
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
int i;
j0 = x0 = 0x61707865;
j5 = x5 = 0x3320646e;
j10 = x10 = 0x79622d32;
j15 = x15 = 0x6b206574;
if (c != NULL) {
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
}
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
j0 = x0 = 0x61707865;
j5 = x5 = 0x3320646e;
j10 = x10 = 0x79622d32;
j15 = x15 = 0x6b206574;
if(c != NULL)
{
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
}
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
for (i = 0; i < rounds; i += 2) {
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
}
STORE32_LE(out + 0, x0 + j0);
STORE32_LE(out + 4, x1 + j1);
STORE32_LE(out + 8, x2 + j2);
STORE32_LE(out + 12, x3 + j3);
STORE32_LE(out + 16, x4 + j4);
STORE32_LE(out + 20, x5 + j5);
STORE32_LE(out + 24, x6 + j6);
STORE32_LE(out + 28, x7 + j7);
STORE32_LE(out + 32, x8 + j8);
STORE32_LE(out + 36, x9 + j9);
STORE32_LE(out + 40, x10 + j10);
STORE32_LE(out + 44, x11 + j11);
STORE32_LE(out + 48, x12 + j12);
STORE32_LE(out + 52, x13 + j13);
STORE32_LE(out + 56, x14 + j14);
STORE32_LE(out + 60, x15 + j15);
for(i = 0; i < rounds; i += 2)
{
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
}
STORE32_LE(out + 0, x0 + j0);
STORE32_LE(out + 4, x1 + j1);
STORE32_LE(out + 8, x2 + j2);
STORE32_LE(out + 12, x3 + j3);
STORE32_LE(out + 16, x4 + j4);
STORE32_LE(out + 20, x5 + j5);
STORE32_LE(out + 24, x6 + j6);
STORE32_LE(out + 28, x7 + j7);
STORE32_LE(out + 32, x8 + j8);
STORE32_LE(out + 36, x9 + j9);
STORE32_LE(out + 40, x10 + j10);
STORE32_LE(out + 44, x11 + j11);
STORE32_LE(out + 48, x12 + j12);
STORE32_LE(out + 52, x13 + j13);
STORE32_LE(out + 56, x14 + j14);
STORE32_LE(out + 60, x15 + j15);
}
int
crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c)
{
crypto_core_salsa(out, in, k, c, 20);
return 0;
crypto_core_salsa(out, in, k, c, 20);
return 0;
}
size_t
crypto_core_salsa20_outputbytes(void)
{
return crypto_core_salsa20_OUTPUTBYTES;
return crypto_core_salsa20_OUTPUTBYTES;
}
size_t
crypto_core_salsa20_inputbytes(void)
{
return crypto_core_salsa20_INPUTBYTES;
return crypto_core_salsa20_INPUTBYTES;
}
size_t
crypto_core_salsa20_keybytes(void)
{
return crypto_core_salsa20_KEYBYTES;
return crypto_core_salsa20_KEYBYTES;
}
size_t
crypto_core_salsa20_constbytes(void)
{
return crypto_core_salsa20_CONSTBYTES;
return crypto_core_salsa20_CONSTBYTES;
}

@ -13,7 +13,6 @@ Public domain.
#include "../stream_salsa20.h"
#include "salsa20_ref.h"
static int
stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n,
const unsigned char *k)
@ -132,4 +131,3 @@ struct crypto_stream_salsa20_implementation
SODIUM_C99(.stream =) stream_ref,
SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic,
};

@ -4,13 +4,13 @@
#include <stdint.h>
typedef struct crypto_stream_salsa20_implementation {
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
typedef struct crypto_stream_salsa20_implementation
{
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k);
} crypto_stream_salsa20_implementation;
#endif

@ -1,195 +1,199 @@
if (bytes > 0) {
__m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint8_t partialblock[64];
unsigned int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
*(uint32_t *) (partialblock + (A * 4)) = in##A; \
*(uint32_t *) (partialblock + (B * 4)) = in##B; \
*(uint32_t *) (partialblock + (C * 4)) = in##C; \
*(uint32_t *) (partialblock + (D * 4)) = in##D; \
} while (0)
if(bytes > 0)
{
__m128i diag0 = _mm_loadu_si128((__m128i *)(x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *)(x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *)(x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *)(x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint8_t partialblock[64];
unsigned int i;
a0 = diag1;
for(i = 0; i < ROUNDS; i += 4)
{
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *)(x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *)(x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *)(x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *)(x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do \
{ \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
*(uint32_t *)(partialblock + (A * 4)) = in##A; \
*(uint32_t *)(partialblock + (B * 4)) = in##B; \
*(uint32_t *)(partialblock + (C * 4)) = in##C; \
*(uint32_t *)(partialblock + (D * 4)) = in##D; \
} while(0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
}
for(i = 0; i < bytes; i++)
{
c[i] = m[i] ^ partialblock[i];
}
sodium_memzero(partialblock, sizeof partialblock);
sodium_memzero(partialblock, sizeof partialblock);
}
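The reformatted SSE2 block above emulates each Salsa20 rotation with a slli/srli pair followed by two xors. For reference, this is the scalar quarter-round that those intrinsics vectorize across four columns at once; the helper names below are illustrative and not part of this diff.

#include <stdint.h>

/* Illustrative scalar reference, not part of the libsodium sources above. */
static uint32_t rotl32(uint32_t v, int n)
{
    return (v << n) | (v >> (32 - n));
}

/* Salsa20 quarter-round: the shift pairs (7/25, 9/23, 13/19, 18/14) in the
 * intrinsics above implement these four rotations on four columns at once. */
static void salsa20_quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *b ^= rotl32(*a + *d, 7);
    *c ^= rotl32(*b + *a, 9);
    *d ^= rotl32(*c + *b, 13);
    *a ^= rotl32(*d + *c, 18);
}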

@ -1,207 +1,211 @@
while (bytes >= 64) {
__m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint32_t in8;
uint32_t in9;
int i;
a0 = diag1;
for (i = 0; i < ROUNDS; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(uint32_t *) (m + (A * 4)); \
in##B ^= *(uint32_t *) (m + (B * 4)); \
in##C ^= *(uint32_t *) (m + (C * 4)); \
in##D ^= *(uint32_t *) (m + (D * 4)); \
*(uint32_t *) (c + (A * 4)) = in##A; \
*(uint32_t *) (c + (B * 4)) = in##B; \
*(uint32_t *) (c + (C * 4)) = in##C; \
*(uint32_t *) (c + (D * 4)) = in##D; \
} while (0)
while(bytes >= 64)
{
__m128i diag0 = _mm_loadu_si128((__m128i *)(x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *)(x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *)(x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *)(x + 12));
__m128i a0, a1, a2, a3, a4, a5, a6, a7;
__m128i b0, b1, b2, b3, b4, b5, b6, b7;
uint32_t in8;
uint32_t in9;
int i;
a0 = diag1;
for(i = 0; i < ROUNDS; i += 4)
{
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *)(x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *)(x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *)(x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *)(x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do \
{ \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(uint32_t *)(m + (A * 4)); \
in##B ^= *(uint32_t *)(m + (B * 4)); \
in##C ^= *(uint32_t *)(m + (C * 4)); \
in##D ^= *(uint32_t *)(m + (D * 4)); \
*(uint32_t *)(c + (A * 4)) = in##A; \
*(uint32_t *)(c + (B * 4)) = in##B; \
*(uint32_t *)(c + (C * 4)) = in##C; \
*(uint32_t *)(c + (D * 4)) = in##D; \
} while(0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
in8 = x[8];
in9 = x[13];
in8++;
if (in8 == 0) {
in9++;
}
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;
in8 = x[8];
in9 = x[13];
in8++;
if(in8 == 0)
{
in9++;
}
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;
}
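At the end of each 64-byte block, the loop above bumps the 64-bit block counter that is split across x[8] (low word) and x[13] (high word), propagating the carry by hand via the in8/in9 pair. A minimal scalar sketch of the same update, under that layout assumption:

#include <stdint.h>

/* Sketch of the counter bump performed after every 64-byte block above,
 * assuming the x[8]/x[13] low/high split used by this implementation. */
static void salsa20_bump_counter(uint32_t x[16])
{
    uint64_t ctr = (uint64_t) x[8] | ((uint64_t) x[13] << 32);

    ctr += 1;                        /* one 64-byte block consumed */
    x[8]  = (uint32_t) ctr;          /* low 32 bits */
    x[13] = (uint32_t) (ctr >> 32);  /* high 32 bits, carry included */
}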

File diff suppressed because it is too large

@ -1,476 +1,471 @@
if (bytes >= 512) {
__m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
y15;
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i z0 = _mm256_set1_epi32(x[0]);
__m256i z5 = _mm256_set1_epi32(x[1]);
__m256i z10 = _mm256_set1_epi32(x[2]);
__m256i z15 = _mm256_set1_epi32(x[3]);
__m256i z12 = _mm256_set1_epi32(x[4]);
__m256i z1 = _mm256_set1_epi32(x[5]);
__m256i z6 = _mm256_set1_epi32(x[6]);
__m256i z11 = _mm256_set1_epi32(x[7]);
__m256i z8; /* useless */
__m256i z13 = _mm256_set1_epi32(x[9]);
__m256i z2 = _mm256_set1_epi32(x[10]);
__m256i z7 = _mm256_set1_epi32(x[11]);
__m256i z4 = _mm256_set1_epi32(x[12]);
__m256i z9; /* useless */
__m256i z14 = _mm256_set1_epi32(x[14]);
__m256i z3 = _mm256_set1_epi32(x[15]);
__m256i orig0 = z0;
__m256i orig1 = z1;
__m256i orig2 = z2;
__m256i orig3 = z3;
__m256i orig4 = z4;
__m256i orig5 = z5;
__m256i orig6 = z6;
__m256i orig7 = z7;
__m256i orig8;
__m256i orig9;
__m256i orig10 = z10;
__m256i orig11 = z11;
__m256i orig12 = z12;
__m256i orig13 = z13;
__m256i orig14 = z14;
__m256i orig15 = z15;
uint32_t in8;
uint32_t in9;
int i;
while (bytes >= 512) {
/* vector implementation for z8 and z9 */
/* faster than the naive version for 8 blocks */
const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; /* see arrays above for the address translation */
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
t8 = _mm256_add_epi64(addv8, z8);
t9 = _mm256_add_epi64(addv9, z9);
z8 = _mm256_unpacklo_epi32(t8, t9);
z9 = _mm256_unpackhi_epi32(t8, t9);
t8 = _mm256_unpacklo_epi32(z8, z9);
t9 = _mm256_unpackhi_epi32(z8, z9);
/* required because unpack* are intra-lane */
z8 = _mm256_permutevar8x32_epi32(t8, permute);
z9 = _mm256_permutevar8x32_epi32(t9, permute);
orig8 = z8;
orig9 = z9;
in89 += 8;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for (i = 0; i < ROUNDS; i += 2) {
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
r14, r15;
y4 = z12;
y4 = _mm256_add_epi32(y4, z0);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 7);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 25);
z4 = _mm256_xor_si256(z4, r4);
y9 = z1;
y9 = _mm256_add_epi32(y9, z5);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 7);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 25);
z9 = _mm256_xor_si256(z9, r9);
y8 = z0;
y8 = _mm256_add_epi32(y8, z4);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z5;
y13 = _mm256_add_epi32(y13, z9);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y12 = z4;
y12 = _mm256_add_epi32(y12, z8);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 13);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 19);
z12 = _mm256_xor_si256(z12, r12);
y1 = z9;
y1 = _mm256_add_epi32(y1, z13);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 13);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 19);
z1 = _mm256_xor_si256(z1, r1);
y0 = z8;
y0 = _mm256_add_epi32(y0, z12);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z13;
y5 = _mm256_add_epi32(y5, z1);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y14 = z6;
y14 = _mm256_add_epi32(y14, z10);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 7);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 25);
z14 = _mm256_xor_si256(z14, r14);
y3 = z11;
y3 = _mm256_add_epi32(y3, z15);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 7);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 25);
z3 = _mm256_xor_si256(z3, r3);
y2 = z10;
y2 = _mm256_add_epi32(y2, z14);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z15;
y7 = _mm256_add_epi32(y7, z3);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y6 = z14;
y6 = _mm256_add_epi32(y6, z2);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 13);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 19);
z6 = _mm256_xor_si256(z6, r6);
y11 = z3;
y11 = _mm256_add_epi32(y11, z7);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 13);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 19);
z11 = _mm256_xor_si256(z11, r11);
y10 = z2;
y10 = _mm256_add_epi32(y10, z6);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y1 = z3;
y1 = _mm256_add_epi32(y1, z0);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 7);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 25);
z1 = _mm256_xor_si256(z1, r1);
y15 = z7;
y15 = _mm256_add_epi32(y15, z11);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
y6 = z4;
y6 = _mm256_add_epi32(y6, z5);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 7);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 25);
z6 = _mm256_xor_si256(z6, r6);
y2 = z0;
y2 = _mm256_add_epi32(y2, z1);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z5;
y7 = _mm256_add_epi32(y7, z6);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y3 = z1;
y3 = _mm256_add_epi32(y3, z2);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 13);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 19);
z3 = _mm256_xor_si256(z3, r3);
y4 = z6;
y4 = _mm256_add_epi32(y4, z7);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 13);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 19);
z4 = _mm256_xor_si256(z4, r4);
y0 = z2;
y0 = _mm256_add_epi32(y0, z3);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z7;
y5 = _mm256_add_epi32(y5, z4);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y11 = z9;
y11 = _mm256_add_epi32(y11, z10);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 7);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 25);
z11 = _mm256_xor_si256(z11, r11);
y12 = z14;
y12 = _mm256_add_epi32(y12, z15);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 7);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 25);
z12 = _mm256_xor_si256(z12, r12);
y8 = z10;
y8 = _mm256_add_epi32(y8, z11);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z15;
y13 = _mm256_add_epi32(y13, z12);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y9 = z11;
y9 = _mm256_add_epi32(y9, z8);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 13);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 19);
z9 = _mm256_xor_si256(z9, r9);
y14 = z12;
y14 = _mm256_add_epi32(y14, z13);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 13);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 19);
z14 = _mm256_xor_si256(z14, r14);
y10 = z8;
y10 = _mm256_add_epi32(y10, z9);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y15 = z13;
y15 = _mm256_add_epi32(y15, z14);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
}
if(bytes >= 512)
{
__m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
/* the naive way seems as fast (if not a bit faster) than the vector way */
__m256i z0 = _mm256_set1_epi32(x[0]);
__m256i z5 = _mm256_set1_epi32(x[1]);
__m256i z10 = _mm256_set1_epi32(x[2]);
__m256i z15 = _mm256_set1_epi32(x[3]);
__m256i z12 = _mm256_set1_epi32(x[4]);
__m256i z1 = _mm256_set1_epi32(x[5]);
__m256i z6 = _mm256_set1_epi32(x[6]);
__m256i z11 = _mm256_set1_epi32(x[7]);
__m256i z8; /* useless */
__m256i z13 = _mm256_set1_epi32(x[9]);
__m256i z2 = _mm256_set1_epi32(x[10]);
__m256i z7 = _mm256_set1_epi32(x[11]);
__m256i z4 = _mm256_set1_epi32(x[12]);
__m256i z9; /* useless */
__m256i z14 = _mm256_set1_epi32(x[14]);
__m256i z3 = _mm256_set1_epi32(x[15]);
__m256i orig0 = z0;
__m256i orig1 = z1;
__m256i orig2 = z2;
__m256i orig3 = z3;
__m256i orig4 = z4;
__m256i orig5 = z5;
__m256i orig6 = z6;
__m256i orig7 = z7;
__m256i orig8;
__m256i orig9;
__m256i orig10 = z10;
__m256i orig11 = z11;
__m256i orig12 = z12;
__m256i orig13 = z13;
__m256i orig14 = z14;
__m256i orig15 = z15;
uint32_t in8;
uint32_t in9;
int i;
while(bytes >= 512)
{
/* vector implementation for z8 and z9 */
/* faster than the naive version for 8 blocks */
const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
__m256i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; /* see arrays above for the address translation */
in89 = ((uint64_t)in8) | (((uint64_t)in9) << 32);
z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
t8 = _mm256_add_epi64(addv8, z8);
t9 = _mm256_add_epi64(addv9, z9);
z8 = _mm256_unpacklo_epi32(t8, t9);
z9 = _mm256_unpackhi_epi32(t8, t9);
t8 = _mm256_unpacklo_epi32(z8, z9);
t9 = _mm256_unpackhi_epi32(z8, z9);
/* required because unpack* are intra-lane */
z8 = _mm256_permutevar8x32_epi32(t8, permute);
z9 = _mm256_permutevar8x32_epi32(t9, permute);
orig8 = z8;
orig9 = z9;
in89 += 8;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for(i = 0; i < ROUNDS; i += 2)
{
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14,
r15;
y4 = z12;
y4 = _mm256_add_epi32(y4, z0);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 7);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 25);
z4 = _mm256_xor_si256(z4, r4);
y9 = z1;
y9 = _mm256_add_epi32(y9, z5);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 7);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 25);
z9 = _mm256_xor_si256(z9, r9);
y8 = z0;
y8 = _mm256_add_epi32(y8, z4);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z5;
y13 = _mm256_add_epi32(y13, z9);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y12 = z4;
y12 = _mm256_add_epi32(y12, z8);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 13);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 19);
z12 = _mm256_xor_si256(z12, r12);
y1 = z9;
y1 = _mm256_add_epi32(y1, z13);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 13);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 19);
z1 = _mm256_xor_si256(z1, r1);
y0 = z8;
y0 = _mm256_add_epi32(y0, z12);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z13;
y5 = _mm256_add_epi32(y5, z1);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y14 = z6;
y14 = _mm256_add_epi32(y14, z10);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 7);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 25);
z14 = _mm256_xor_si256(z14, r14);
y3 = z11;
y3 = _mm256_add_epi32(y3, z15);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 7);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 25);
z3 = _mm256_xor_si256(z3, r3);
y2 = z10;
y2 = _mm256_add_epi32(y2, z14);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z15;
y7 = _mm256_add_epi32(y7, z3);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y6 = z14;
y6 = _mm256_add_epi32(y6, z2);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 13);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 19);
z6 = _mm256_xor_si256(z6, r6);
y11 = z3;
y11 = _mm256_add_epi32(y11, z7);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 13);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 19);
z11 = _mm256_xor_si256(z11, r11);
y10 = z2;
y10 = _mm256_add_epi32(y10, z6);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y1 = z3;
y1 = _mm256_add_epi32(y1, z0);
r1 = y1;
y1 = _mm256_slli_epi32(y1, 7);
z1 = _mm256_xor_si256(z1, y1);
r1 = _mm256_srli_epi32(r1, 25);
z1 = _mm256_xor_si256(z1, r1);
y15 = z7;
y15 = _mm256_add_epi32(y15, z11);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
y6 = z4;
y6 = _mm256_add_epi32(y6, z5);
r6 = y6;
y6 = _mm256_slli_epi32(y6, 7);
z6 = _mm256_xor_si256(z6, y6);
r6 = _mm256_srli_epi32(r6, 25);
z6 = _mm256_xor_si256(z6, r6);
y2 = z0;
y2 = _mm256_add_epi32(y2, z1);
r2 = y2;
y2 = _mm256_slli_epi32(y2, 9);
z2 = _mm256_xor_si256(z2, y2);
r2 = _mm256_srli_epi32(r2, 23);
z2 = _mm256_xor_si256(z2, r2);
y7 = z5;
y7 = _mm256_add_epi32(y7, z6);
r7 = y7;
y7 = _mm256_slli_epi32(y7, 9);
z7 = _mm256_xor_si256(z7, y7);
r7 = _mm256_srli_epi32(r7, 23);
z7 = _mm256_xor_si256(z7, r7);
y3 = z1;
y3 = _mm256_add_epi32(y3, z2);
r3 = y3;
y3 = _mm256_slli_epi32(y3, 13);
z3 = _mm256_xor_si256(z3, y3);
r3 = _mm256_srli_epi32(r3, 19);
z3 = _mm256_xor_si256(z3, r3);
y4 = z6;
y4 = _mm256_add_epi32(y4, z7);
r4 = y4;
y4 = _mm256_slli_epi32(y4, 13);
z4 = _mm256_xor_si256(z4, y4);
r4 = _mm256_srli_epi32(r4, 19);
z4 = _mm256_xor_si256(z4, r4);
y0 = z2;
y0 = _mm256_add_epi32(y0, z3);
r0 = y0;
y0 = _mm256_slli_epi32(y0, 18);
z0 = _mm256_xor_si256(z0, y0);
r0 = _mm256_srli_epi32(r0, 14);
z0 = _mm256_xor_si256(z0, r0);
y5 = z7;
y5 = _mm256_add_epi32(y5, z4);
r5 = y5;
y5 = _mm256_slli_epi32(y5, 18);
z5 = _mm256_xor_si256(z5, y5);
r5 = _mm256_srli_epi32(r5, 14);
z5 = _mm256_xor_si256(z5, r5);
y11 = z9;
y11 = _mm256_add_epi32(y11, z10);
r11 = y11;
y11 = _mm256_slli_epi32(y11, 7);
z11 = _mm256_xor_si256(z11, y11);
r11 = _mm256_srli_epi32(r11, 25);
z11 = _mm256_xor_si256(z11, r11);
y12 = z14;
y12 = _mm256_add_epi32(y12, z15);
r12 = y12;
y12 = _mm256_slli_epi32(y12, 7);
z12 = _mm256_xor_si256(z12, y12);
r12 = _mm256_srli_epi32(r12, 25);
z12 = _mm256_xor_si256(z12, r12);
y8 = z10;
y8 = _mm256_add_epi32(y8, z11);
r8 = y8;
y8 = _mm256_slli_epi32(y8, 9);
z8 = _mm256_xor_si256(z8, y8);
r8 = _mm256_srli_epi32(r8, 23);
z8 = _mm256_xor_si256(z8, r8);
y13 = z15;
y13 = _mm256_add_epi32(y13, z12);
r13 = y13;
y13 = _mm256_slli_epi32(y13, 9);
z13 = _mm256_xor_si256(z13, y13);
r13 = _mm256_srli_epi32(r13, 23);
z13 = _mm256_xor_si256(z13, r13);
y9 = z11;
y9 = _mm256_add_epi32(y9, z8);
r9 = y9;
y9 = _mm256_slli_epi32(y9, 13);
z9 = _mm256_xor_si256(z9, y9);
r9 = _mm256_srli_epi32(r9, 19);
z9 = _mm256_xor_si256(z9, r9);
y14 = z12;
y14 = _mm256_add_epi32(y14, z13);
r14 = y14;
y14 = _mm256_slli_epi32(y14, 13);
z14 = _mm256_xor_si256(z14, y14);
r14 = _mm256_srli_epi32(r14, 19);
z14 = _mm256_xor_si256(z14, r14);
y10 = z8;
y10 = _mm256_add_epi32(y10, z9);
r10 = y10;
y10 = _mm256_slli_epi32(y10, 18);
z10 = _mm256_xor_si256(z10, y10);
r10 = _mm256_srli_epi32(r10, 14);
z10 = _mm256_xor_si256(z10, r10);
y15 = z13;
y15 = _mm256_add_epi32(y15, z14);
r15 = y15;
y15 = _mm256_slli_epi32(y15, 18);
z15 = _mm256_xor_si256(z15, y15);
r15 = _mm256_srli_epi32(r15, 14);
z15 = _mm256_xor_si256(z15, r15);
}
/* store data; this macro first transposes the data in registers, and then
 * stores it in memory. much faster with icc. */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
_mm_loadu_si128((__m128i*) (m + 0))); \
_mm_storeu_si128((__m128i*) (c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
_mm_loadu_si128((__m128i*) (m + 64))); \
_mm_storeu_si128((__m128i*) (c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
_mm_loadu_si128((__m128i*) (m + 128))); \
_mm_storeu_si128((__m128i*) (c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
_mm_loadu_si128((__m128i*) (m + 192))); \
_mm_storeu_si128((__m128i*) (c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
_mm_loadu_si128((__m128i*) (m + 256))); \
_mm_storeu_si128((__m128i*) (c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
_mm_loadu_si128((__m128i*) (m + 320))); \
_mm_storeu_si128((__m128i*) (c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
_mm_loadu_si128((__m128i*) (m + 384))); \
_mm_storeu_si128((__m128i*) (c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
_mm_loadu_si128((__m128i*) (m + 448))); \
_mm_storeu_si128((__m128i*) (c + 448), t3); \
}
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
{ \
__m128i t0, t1, t2, t3; \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
_mm_loadu_si128((__m128i*)(m + 0))); \
_mm_storeu_si128((__m128i*)(c + 0), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
_mm_loadu_si128((__m128i*)(m + 64))); \
_mm_storeu_si128((__m128i*)(c + 64), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
_mm_loadu_si128((__m128i*)(m + 128))); \
_mm_storeu_si128((__m128i*)(c + 128), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
_mm_loadu_si128((__m128i*)(m + 192))); \
_mm_storeu_si128((__m128i*)(c + 192), t3); \
t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
_mm_loadu_si128((__m128i*)(m + 256))); \
_mm_storeu_si128((__m128i*)(c + 256), t0); \
t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
_mm_loadu_si128((__m128i*)(m + 320))); \
_mm_storeu_si128((__m128i*)(c + 320), t1); \
t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
_mm_loadu_si128((__m128i*)(m + 384))); \
_mm_storeu_si128((__m128i*)(c + 384), t2); \
t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
_mm_loadu_si128((__m128i*)(m + 448))); \
_mm_storeu_si128((__m128i*)(c + 448), t3); \
}
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
}
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
y##B = \
_mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
y##C = \
_mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
y##D = \
_mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
y##A2 = \
_mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \
y##B2 = \
_mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \
y##C2 = \
_mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \
y##D2 = \
_mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \
_mm256_storeu_si256((__m256i*) (c + 0), y##A); \
_mm256_storeu_si256((__m256i*) (c + 64), y##B); \
_mm256_storeu_si256((__m256i*) (c + 128), y##C); \
_mm256_storeu_si256((__m256i*) (c + 192), y##D); \
_mm256_storeu_si256((__m256i*) (c + 256), y##A2); \
_mm256_storeu_si256((__m256i*) (c + 320), y##B2); \
_mm256_storeu_si256((__m256i*) (c + 384), y##C2); \
_mm256_storeu_si256((__m256i*) (c + 448), y##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#define ONEQUAD_UNPCK(A, B, C, D) \
{ \
z##A = _mm256_add_epi32(z##A, orig##A); \
z##B = _mm256_add_epi32(z##B, orig##B); \
z##C = _mm256_add_epi32(z##C, orig##C); \
z##D = _mm256_add_epi32(z##D, orig##D); \
y##A = _mm256_unpacklo_epi32(z##A, z##B); \
y##B = _mm256_unpacklo_epi32(z##C, z##D); \
y##C = _mm256_unpackhi_epi32(z##A, z##B); \
y##D = _mm256_unpackhi_epi32(z##C, z##D); \
z##A = _mm256_unpacklo_epi64(y##A, y##B); \
z##B = _mm256_unpackhi_epi64(y##A, y##B); \
z##C = _mm256_unpacklo_epi64(y##C, y##D); \
z##D = _mm256_unpackhi_epi64(y##C, y##D); \
}
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
{ \
ONEQUAD_UNPCK(A, B, C, D); \
ONEQUAD_UNPCK(A2, B2, C2, D2); \
y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*)(m + 0))); \
y##B = _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*)(m + 64))); \
y##C = _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*)(m + 128))); \
y##D = _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*)(m + 192))); \
y##A2 = _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*)(m + 256))); \
y##B2 = _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*)(m + 320))); \
y##C2 = _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*)(m + 384))); \
y##D2 = _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*)(m + 448))); \
_mm256_storeu_si256((__m256i*)(c + 0), y##A); \
_mm256_storeu_si256((__m256i*)(c + 64), y##B); \
_mm256_storeu_si256((__m256i*)(c + 128), y##C); \
_mm256_storeu_si256((__m256i*)(c + 192), y##D); \
_mm256_storeu_si256((__m256i*)(c + 256), y##A2); \
_mm256_storeu_si256((__m256i*)(c + 320), y##B2); \
_mm256_storeu_si256((__m256i*)(c + 384), y##C2); \
_mm256_storeu_si256((__m256i*)(c + 448), y##D2); \
}
ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
m += 32;
c += 32;
ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
m -= 32;
c -= 32;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO
bytes -= 512;
c += 512;
m += 512;
}
bytes -= 512;
c += 512;
m += 512;
}
}
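In the AVX2 path above, the addv8/addv9 additions and the unpack/permute sequence simply materialize eight consecutive block counters for one 512-byte pass, split into a low-word vector (z8) and a high-word vector (z9). A scalar sketch of what that sequence computes; the function and parameter names are mine, not libsodium's:

#include <stdint.h>

/* Scalar equivalent of the counter setup in the 512-byte AVX2 loop above:
 * eight consecutive 64-bit counters, split into x[8]-style low words and
 * x[13]-style high words, one per block lane. */
static void salsa20_eight_counters(uint32_t x8, uint32_t x13,
                                   uint32_t lo[8], uint32_t hi[8])
{
    uint64_t base = ((uint64_t) x8) | (((uint64_t) x13) << 32);
    int      i;

    for (i = 0; i < 8; i++) {
        uint64_t ctr = base + (uint64_t) i;

        lo[i] = (uint32_t) ctr;
        hi[i] = (uint32_t) (ctr >> 32);
    }
}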
