make format crypto/

5 years ago · 595b15d538
parent 842fe039bc
commit 595b15d538
86 changed files with 7847 additions and 7010 deletions
--- a/crypto/blake2b/blake2b-compress-avx2.h
+++ b/crypto/blake2b/blake2b-compress-avx2.h
@@ -2,30 +2,30 @@
 #ifndef blake2b_compress_avx2_H
 #define blake2b_compress_avx2_H

-#define LOADU128(p) _mm_loadu_si128((__m128i *) (p))
-#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), r)
+#define LOADU128(p) _mm_loadu_si128((__m128i *)(p))
+#define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r)

-#define LOAD(p) _mm256_load_si256((__m256i *) (p))
-#define STORE(p, r) _mm256_store_si256((__m256i *) (p), r)
+#define LOAD(p) _mm256_load_si256((__m256i *)(p))
+#define STORE(p, r) _mm256_store_si256((__m256i *)(p), r)

-#define LOADU(p) _mm256_loadu_si256((__m256i *) (p))
-#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), r)
+#define LOADU(p) _mm256_loadu_si256((__m256i *)(p))
+#define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r)

 static inline uint64_t
 LOADU64(const void *p)
 {
-    uint64_t v;
-    memcpy(&v, p, sizeof v);
-    return v;
+  uint64_t v;
+  memcpy(&v, p, sizeof v);
+  return v;
 }

-#define ROTATE16                                                              \
-    _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \
-                     3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
+#define ROTATE16                                                               \
+  _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, \
+                   4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)

-#define ROTATE24                                                              \
-    _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \
-                     4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
+#define ROTATE24                                                               \
+  _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, \
+                   5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)

 #define ADD(a, b) _mm256_add_epi64(a, b)
 #define SUB(a, b) _mm256_sub_epi64(a, b)
@@ -40,98 +40,104 @@ LOADU64(const void *p)
 #define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))

 #define BLAKE2B_G1_V1(a, b, c, d, m) \
-    do {                             \
-        a = ADD(a, m);               \
-        a = ADD(a, b);               \
-        d = XOR(d, a);               \
-        d = ROT32(d);                \
-        c = ADD(c, d);               \
-        b = XOR(b, c);               \
-        b = ROT24(b);                \
-    } while (0)
+  do                                 \
+  {                                  \
+    a = ADD(a, m);                   \
+    a = ADD(a, b);                   \
+    d = XOR(d, a);                   \
+    d = ROT32(d);                    \
+    c = ADD(c, d);                   \
+    b = XOR(b, c);                   \
+    b = ROT24(b);                    \
+  } while(0)

 #define BLAKE2B_G2_V1(a, b, c, d, m) \
-    do {                             \
-        a = ADD(a, m);               \
-        a = ADD(a, b);               \
-        d = XOR(d, a);               \
-        d = ROT16(d);                \
-        c = ADD(c, d);               \
-        b = XOR(b, c);               \
-        b = ROT63(b);                \
-    } while (0)
-
-#define BLAKE2B_DIAG_V1(a, b, c, d)                               \
-    do {                                                          \
-        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
-        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
-        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
-    } while (0)
-
-#define BLAKE2B_UNDIAG_V1(a, b, c, d)                             \
-    do {                                                          \
-        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
-        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
-        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
-    } while (0)
+  do                                 \
+  {                                  \
+    a = ADD(a, m);                   \
+    a = ADD(a, b);                   \
+    d = XOR(d, a);                   \
+    d = ROT16(d);                    \
+    c = ADD(c, d);                   \
+    b = XOR(b, c);                   \
+    b = ROT63(b);                    \
+  } while(0)
+
+#define BLAKE2B_DIAG_V1(a, b, c, d)                           \
+  do                                                          \
+  {                                                           \
+    d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
+    c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
+    b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
+  } while(0)
+
+#define BLAKE2B_UNDIAG_V1(a, b, c, d)                         \
+  do                                                          \
+  {                                                           \
+    d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
+    c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
+    b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
+  } while(0)

 #include "blake2b-load-avx2.h"

 #define BLAKE2B_ROUND_V1(a, b, c, d, r, m) \
-    do {                                   \
-        __m256i b0;                        \
-        BLAKE2B_LOAD_MSG_##r##_1(b0);      \
-        BLAKE2B_G1_V1(a, b, c, d, b0);     \
-        BLAKE2B_LOAD_MSG_##r##_2(b0);      \
-        BLAKE2B_G2_V1(a, b, c, d, b0);     \
-        BLAKE2B_DIAG_V1(a, b, c, d);       \
-        BLAKE2B_LOAD_MSG_##r##_3(b0);      \
-        BLAKE2B_G1_V1(a, b, c, d, b0);     \
-        BLAKE2B_LOAD_MSG_##r##_4(b0);      \
-        BLAKE2B_G2_V1(a, b, c, d, b0);     \
-        BLAKE2B_UNDIAG_V1(a, b, c, d);     \
-    } while (0)
-
-#define BLAKE2B_ROUNDS_V1(a, b, c, d, m)       \
-    do {                                       \
-        BLAKE2B_ROUND_V1(a, b, c, d, 0, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 1, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 2, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 3, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 4, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 5, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 6, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 7, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 8, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 9, (m));  \
-        BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
-        BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
-    } while (0)
-
-#define DECLARE_MESSAGE_WORDS(m)                                         \
-    const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0));   \
-    const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16));  \
-    const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32));  \
-    const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48));  \
-    const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64));  \
-    const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80));  \
-    const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96));  \
-    const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
-    __m256i       t0, t1;
-
-#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1)                      \
-    do {                                                                  \
-        DECLARE_MESSAGE_WORDS(m)                                          \
-        const __m256i iv0 = a;                                            \
-        const __m256i iv1 = b;                                            \
-        __m256i       c   = LOAD(&blake2b_IV[0]);                         \
-        __m256i       d =                                                 \
-            XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
-        BLAKE2B_ROUNDS_V1(a, b, c, d, m);                                 \
-        a = XOR(a, c);                                                    \
-        b = XOR(b, d);                                                    \
-        a = XOR(a, iv0);                                                  \
-        b = XOR(b, iv1);                                                  \
-    } while (0)
+  do                                       \
+  {                                        \
+    __m256i b0;                            \
+    BLAKE2B_LOAD_MSG_##r##_1(b0);          \
+    BLAKE2B_G1_V1(a, b, c, d, b0);         \
+    BLAKE2B_LOAD_MSG_##r##_2(b0);          \
+    BLAKE2B_G2_V1(a, b, c, d, b0);         \
+    BLAKE2B_DIAG_V1(a, b, c, d);           \
+    BLAKE2B_LOAD_MSG_##r##_3(b0);          \
+    BLAKE2B_G1_V1(a, b, c, d, b0);         \
+    BLAKE2B_LOAD_MSG_##r##_4(b0);          \
+    BLAKE2B_G2_V1(a, b, c, d, b0);         \
+    BLAKE2B_UNDIAG_V1(a, b, c, d);         \
+  } while(0)
+
+#define BLAKE2B_ROUNDS_V1(a, b, c, d, m)   \
+  do                                       \
+  {                                        \
+    BLAKE2B_ROUND_V1(a, b, c, d, 0, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 1, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 2, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 3, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 4, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 5, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 6, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 7, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 8, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 9, (m));  \
+    BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
+  } while(0)
+
+#define DECLARE_MESSAGE_WORDS(m)                                       \
+  const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0));   \
+  const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16));  \
+  const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32));  \
+  const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48));  \
+  const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64));  \
+  const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80));  \
+  const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96));  \
+  const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
+  __m256i t0, t1;
+
+#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1)                          \
+  do                                                                          \
+  {                                                                           \
+    DECLARE_MESSAGE_WORDS(m)                                                  \
+    const __m256i iv0 = a;                                                    \
+    const __m256i iv1 = b;                                                    \
+    __m256i c         = LOAD(&blake2b_IV[0]);                                 \
+    __m256i d = XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
+    BLAKE2B_ROUNDS_V1(a, b, c, d, m);                                         \
+    a = XOR(a, c);                                                            \
+    b = XOR(b, d);                                                            \
+    a = XOR(a, iv0);                                                          \
+    b = XOR(b, iv1);                                                          \
+  } while(0)

 #endif
--- a/crypto/blake2b/blake2b-compress-sse41.h
+++ b/crypto/blake2b/blake2b-compress-sse41.h
@@ -2,102 +2,99 @@
 #ifndef blake2b_compress_sse41_H
 #define blake2b_compress_sse41_H

-#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
-#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)
+#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
+#define STOREU(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)

-#define _mm_roti_epi64(x, c)                                         \
-    (-(c) == 32)                                                     \
-        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))            \
-        : (-(c) == 24)                                               \
-              ? _mm_shuffle_epi8((x), r24)                           \
-              : (-(c) == 16)                                         \
-                    ? _mm_shuffle_epi8((x), r16)                     \
-                    : (-(c) == 63)                                   \
-                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
-                                          _mm_add_epi64((x), (x)))   \
-                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
-                                          _mm_slli_epi64((x), 64 - (-(c))))
+#define _mm_roti_epi64(x, c)                                            \
+  (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))        \
+               : (-(c) == 24) ? _mm_shuffle_epi8((x), r24)              \
+                              : (-(c) == 16)                            \
+              ? _mm_shuffle_epi8((x), r16)                              \
+              : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                                             _mm_add_epi64((x), (x)))   \
+                             : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                                             _mm_slli_epi64((x), 64 - (-(c))))

 #define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
-    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);                \
-    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);                \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);                  \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);                  \
                                                                           \
-    row4l = _mm_xor_si128(row4l, row1l);                                   \
-    row4h = _mm_xor_si128(row4h, row1h);                                   \
+  row4l = _mm_xor_si128(row4l, row1l);                                     \
+  row4h = _mm_xor_si128(row4h, row1h);                                     \
                                                                           \
-    row4l = _mm_roti_epi64(row4l, -32);                                    \
-    row4h = _mm_roti_epi64(row4h, -32);                                    \
+  row4l = _mm_roti_epi64(row4l, -32);                                      \
+  row4h = _mm_roti_epi64(row4h, -32);                                      \
                                                                           \
-    row3l = _mm_add_epi64(row3l, row4l);                                   \
-    row3h = _mm_add_epi64(row3h, row4h);                                   \
+  row3l = _mm_add_epi64(row3l, row4l);                                     \
+  row3h = _mm_add_epi64(row3h, row4h);                                     \
                                                                           \
-    row2l = _mm_xor_si128(row2l, row3l);                                   \
-    row2h = _mm_xor_si128(row2h, row3h);                                   \
+  row2l = _mm_xor_si128(row2l, row3l);                                     \
+  row2h = _mm_xor_si128(row2h, row3h);                                     \
                                                                           \
-    row2l = _mm_roti_epi64(row2l, -24);                                    \
-    row2h = _mm_roti_epi64(row2h, -24);
+  row2l = _mm_roti_epi64(row2l, -24);                                      \
+  row2h = _mm_roti_epi64(row2h, -24);

 #define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
-    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);                \
-    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);                \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);                  \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);                  \
                                                                           \
-    row4l = _mm_xor_si128(row4l, row1l);                                   \
-    row4h = _mm_xor_si128(row4h, row1h);                                   \
+  row4l = _mm_xor_si128(row4l, row1l);                                     \
+  row4h = _mm_xor_si128(row4h, row1h);                                     \
                                                                           \
-    row4l = _mm_roti_epi64(row4l, -16);                                    \
-    row4h = _mm_roti_epi64(row4h, -16);                                    \
+  row4l = _mm_roti_epi64(row4l, -16);                                      \
+  row4h = _mm_roti_epi64(row4h, -16);                                      \
                                                                           \
-    row3l = _mm_add_epi64(row3l, row4l);                                   \
-    row3h = _mm_add_epi64(row3h, row4h);                                   \
+  row3l = _mm_add_epi64(row3l, row4l);                                     \
+  row3h = _mm_add_epi64(row3h, row4h);                                     \
                                                                           \
-    row2l = _mm_xor_si128(row2l, row3l);                                   \
-    row2h = _mm_xor_si128(row2h, row3h);                                   \
+  row2l = _mm_xor_si128(row2l, row3l);                                     \
+  row2h = _mm_xor_si128(row2h, row3h);                                     \
                                                                           \
-    row2l = _mm_roti_epi64(row2l, -63);                                    \
-    row2h = _mm_roti_epi64(row2h, -63);
+  row2l = _mm_roti_epi64(row2l, -63);                                      \
+  row2h = _mm_roti_epi64(row2h, -63);

 #define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
-    t0    = _mm_alignr_epi8(row2h, row2l, 8);                               \
-    t1    = _mm_alignr_epi8(row2l, row2h, 8);                               \
-    row2l = t0;                                                             \
-    row2h = t1;                                                             \
+  t0    = _mm_alignr_epi8(row2h, row2l, 8);                                 \
+  t1    = _mm_alignr_epi8(row2l, row2h, 8);                                 \
+  row2l = t0;                                                               \
+  row2h = t1;                                                               \
                                                                            \
-    t0    = row3l;                                                          \
-    row3l = row3h;                                                          \
-    row3h = t0;                                                             \
+  t0    = row3l;                                                            \
+  row3l = row3h;                                                            \
+  row3h = t0;                                                               \
                                                                            \
-    t0    = _mm_alignr_epi8(row4h, row4l, 8);                               \
-    t1    = _mm_alignr_epi8(row4l, row4h, 8);                               \
-    row4l = t1;                                                             \
-    row4h = t0;
+  t0    = _mm_alignr_epi8(row4h, row4l, 8);                                 \
+  t1    = _mm_alignr_epi8(row4l, row4h, 8);                                 \
+  row4l = t1;                                                               \
+  row4h = t0;

 #define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
-    t0    = _mm_alignr_epi8(row2l, row2h, 8);                                 \
-    t1    = _mm_alignr_epi8(row2h, row2l, 8);                                 \
-    row2l = t0;                                                               \
-    row2h = t1;                                                               \
+  t0    = _mm_alignr_epi8(row2l, row2h, 8);                                   \
+  t1    = _mm_alignr_epi8(row2h, row2l, 8);                                   \
+  row2l = t0;                                                                 \
+  row2h = t1;                                                                 \
                                                                              \
-    t0    = row3l;                                                            \
-    row3l = row3h;                                                            \
-    row3h = t0;                                                               \
+  t0    = row3l;                                                              \
+  row3l = row3h;                                                              \
+  row3h = t0;                                                                 \
                                                                              \
-    t0    = _mm_alignr_epi8(row4l, row4h, 8);                                 \
-    t1    = _mm_alignr_epi8(row4h, row4l, 8);                                 \
-    row4l = t1;                                                               \
-    row4h = t0;
+  t0    = _mm_alignr_epi8(row4l, row4h, 8);                                   \
+  t1    = _mm_alignr_epi8(row4h, row4l, 8);                                   \
+  row4l = t1;                                                                 \
+  row4h = t0;

 #include "blake2b-load-sse41.h"

-#define ROUND(r)                                                         \
-    LOAD_MSG_##r##_1(b0, b1);                                            \
-    G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
-    LOAD_MSG_##r##_2(b0, b1);                                            \
-    G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
-    DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
-    LOAD_MSG_##r##_3(b0, b1);                                            \
-    G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
-    LOAD_MSG_##r##_4(b0, b1);                                            \
-    G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
-    UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
+#define ROUND(r)                                                       \
+  LOAD_MSG_##r##_1(b0, b1);                                            \
+  G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
+  LOAD_MSG_##r##_2(b0, b1);                                            \
+  G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
+  DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+  LOAD_MSG_##r##_3(b0, b1);                                            \
+  G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
+  LOAD_MSG_##r##_4(b0, b1);                                            \
+  G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
+  UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);

 #endif
--- a/crypto/blake2b/blake2b-compress-ssse3.h
+++ b/crypto/blake2b/blake2b-compress-ssse3.h
@@ -2,102 +2,99 @@
 #ifndef blake2b_compress_ssse3_H
 #define blake2b_compress_ssse3_H

-#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
-#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)
+#define LOADU(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
+#define STOREU(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)

-#define _mm_roti_epi64(x, c)                                         \
-    (-(c) == 32)                                                     \
-        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))            \
-        : (-(c) == 24)                                               \
-              ? _mm_shuffle_epi8((x), r24)                           \
-              : (-(c) == 16)                                         \
-                    ? _mm_shuffle_epi8((x), r16)                     \
-                    : (-(c) == 63)                                   \
-                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
-                                          _mm_add_epi64((x), (x)))   \
-                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
-                                          _mm_slli_epi64((x), 64 - (-(c))))
+#define _mm_roti_epi64(x, c)                                            \
+  (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))        \
+               : (-(c) == 24) ? _mm_shuffle_epi8((x), r24)              \
+                              : (-(c) == 16)                            \
+              ? _mm_shuffle_epi8((x), r16)                              \
+              : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                                             _mm_add_epi64((x), (x)))   \
+                             : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
+                                             _mm_slli_epi64((x), 64 - (-(c))))

 #define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
-    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);                \
-    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);                \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);                  \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);                  \
                                                                           \
-    row4l = _mm_xor_si128(row4l, row1l);                                   \
-    row4h = _mm_xor_si128(row4h, row1h);                                   \
+  row4l = _mm_xor_si128(row4l, row1l);                                     \
+  row4h = _mm_xor_si128(row4h, row1h);                                     \
                                                                           \
-    row4l = _mm_roti_epi64(row4l, -32);                                    \
-    row4h = _mm_roti_epi64(row4h, -32);                                    \
+  row4l = _mm_roti_epi64(row4l, -32);                                      \
+  row4h = _mm_roti_epi64(row4h, -32);                                      \
                                                                           \
-    row3l = _mm_add_epi64(row3l, row4l);                                   \
-    row3h = _mm_add_epi64(row3h, row4h);                                   \
+  row3l = _mm_add_epi64(row3l, row4l);                                     \
+  row3h = _mm_add_epi64(row3h, row4h);                                     \
                                                                           \
-    row2l = _mm_xor_si128(row2l, row3l);                                   \
-    row2h = _mm_xor_si128(row2h, row3h);                                   \
+  row2l = _mm_xor_si128(row2l, row3l);                                     \
+  row2h = _mm_xor_si128(row2h, row3h);                                     \
                                                                           \
-    row2l = _mm_roti_epi64(row2l, -24);                                    \
-    row2h = _mm_roti_epi64(row2h, -24);
+  row2l = _mm_roti_epi64(row2l, -24);                                      \
+  row2h = _mm_roti_epi64(row2h, -24);

 #define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
-    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);                \
-    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);                \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);                  \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);                  \
                                                                           \
-    row4l = _mm_xor_si128(row4l, row1l);                                   \
-    row4h = _mm_xor_si128(row4h, row1h);                                   \
+  row4l = _mm_xor_si128(row4l, row1l);                                     \
+  row4h = _mm_xor_si128(row4h, row1h);                                     \
                                                                           \
-    row4l = _mm_roti_epi64(row4l, -16);                                    \
-    row4h = _mm_roti_epi64(row4h, -16);                                    \
+  row4l = _mm_roti_epi64(row4l, -16);                                      \
+  row4h = _mm_roti_epi64(row4h, -16);                                      \
                                                                           \
-    row3l = _mm_add_epi64(row3l, row4l);                                   \
-    row3h = _mm_add_epi64(row3h, row4h);                                   \
+  row3l = _mm_add_epi64(row3l, row4l);                                     \
+  row3h = _mm_add_epi64(row3h, row4h);                                     \
                                                                           \
-    row2l = _mm_xor_si128(row2l, row3l);                                   \
-    row2h = _mm_xor_si128(row2h, row3h);                                   \
+  row2l = _mm_xor_si128(row2l, row3l);                                     \
+  row2h = _mm_xor_si128(row2h, row3h);                                     \
                                                                           \
-    row2l = _mm_roti_epi64(row2l, -63);                                    \
-    row2h = _mm_roti_epi64(row2h, -63);
+  row2l = _mm_roti_epi64(row2l, -63);                                      \
+  row2h = _mm_roti_epi64(row2h, -63);

 #define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
-    t0    = _mm_alignr_epi8(row2h, row2l, 8);                               \
-    t1    = _mm_alignr_epi8(row2l, row2h, 8);                               \
-    row2l = t0;                                                             \
-    row2h = t1;                                                             \
+  t0    = _mm_alignr_epi8(row2h, row2l, 8);                                 \
+  t1    = _mm_alignr_epi8(row2l, row2h, 8);                                 \
+  row2l = t0;                                                               \
+  row2h = t1;                                                               \
                                                                            \
-    t0    = row3l;                                                          \
-    row3l = row3h;                                                          \
-    row3h = t0;                                                             \
+  t0    = row3l;                                                            \
+  row3l = row3h;                                                            \
+  row3h = t0;                                                               \
                                                                            \
-    t0    = _mm_alignr_epi8(row4h, row4l, 8);                               \
-    t1    = _mm_alignr_epi8(row4l, row4h, 8);                               \
-    row4l = t1;                                                             \
-    row4h = t0;
+  t0    = _mm_alignr_epi8(row4h, row4l, 8);                                 \
+  t1    = _mm_alignr_epi8(row4l, row4h, 8);                                 \
+  row4l = t1;                                                               \
+  row4h = t0;

 #define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
-    t0    = _mm_alignr_epi8(row2l, row2h, 8);                                 \
-    t1    = _mm_alignr_epi8(row2h, row2l, 8);                                 \
-    row2l = t0;                                                               \
-    row2h = t1;                                                               \
+  t0    = _mm_alignr_epi8(row2l, row2h, 8);                                   \
+  t1    = _mm_alignr_epi8(row2h, row2l, 8);                                   \
+  row2l = t0;                                                                 \
+  row2h = t1;                                                                 \
                                                                              \
-    t0    = row3l;                                                            \
-    row3l = row3h;                                                            \
-    row3h = t0;                                                               \
+  t0    = row3l;                                                              \
+  row3l = row3h;                                                              \
+  row3h = t0;                                                                 \
                                                                              \
-    t0    = _mm_alignr_epi8(row4l, row4h, 8);                                 \
-    t1    = _mm_alignr_epi8(row4h, row4l, 8);                                 \
-    row4l = t1;                                                               \
-    row4h = t0;
+  t0    = _mm_alignr_epi8(row4l, row4h, 8);                                   \
+  t1    = _mm_alignr_epi8(row4h, row4l, 8);                                   \
+  row4l = t1;                                                                 \
+  row4h = t0;

 #include "blake2b-load-sse2.h"

-#define ROUND(r)                                                         \
-    LOAD_MSG_##r##_1(b0, b1);                                            \
-    G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
-    LOAD_MSG_##r##_2(b0, b1);                                            \
-    G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
-    DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
-    LOAD_MSG_##r##_3(b0, b1);                                            \
-    G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
-    LOAD_MSG_##r##_4(b0, b1);                                            \
-    G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
-    UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
+#define ROUND(r)                                                       \
+  LOAD_MSG_##r##_1(b0, b1);                                            \
+  G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
+  LOAD_MSG_##r##_2(b0, b1);                                            \
+  G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
+  DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
+  LOAD_MSG_##r##_3(b0, b1);                                            \
+  G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
+  LOAD_MSG_##r##_4(b0, b1);                                            \
+  G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
+  UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);

 #endif
--- a/crypto/blake2b/blake2b-load-avx2.h
+++ b/crypto/blake2b/blake2b-load-avx2.h
@@ -1,340 +1,388 @@
 #ifndef blake2b_load_avx2_H
 #define blake2b_load_avx2_H

-#define BLAKE2B_LOAD_MSG_0_1(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m0, m1);    \
-        t1 = _mm256_unpacklo_epi64(m2, m3);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_0_2(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m0, m1);    \
-        t1 = _mm256_unpackhi_epi64(m2, m3);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_0_3(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m4, m5);    \
-        t1 = _mm256_unpacklo_epi64(m6, m7);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_0_4(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m4, m5);    \
-        t1 = _mm256_unpackhi_epi64(m6, m7);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_1_1(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m7, m2);    \
-        t1 = _mm256_unpackhi_epi64(m4, m6);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_1_2(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m5, m4);    \
-        t1 = _mm256_alignr_epi8(m3, m7, 8);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_1_3(b0)                                \
-    do {                                                        \
-        t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
-        t1 = _mm256_unpackhi_epi64(m5, m2);                     \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0);                  \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_1_4(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m6, m1);    \
-        t1 = _mm256_unpackhi_epi64(m3, m1);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_2_1(b0)               \
-    do {                                       \
-        t0 = _mm256_alignr_epi8(m6, m5, 8);    \
-        t1 = _mm256_unpackhi_epi64(m2, m7);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_2_2(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m4, m0);    \
-        t1 = _mm256_blend_epi32(m6, m1, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_2_3(b0)               \
-    do {                                       \
-        t0 = _mm256_blend_epi32(m1, m5, 0x33); \
-        t1 = _mm256_unpackhi_epi64(m3, m4);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_2_4(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m7, m3);    \
-        t1 = _mm256_alignr_epi8(m2, m0, 8);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_3_1(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m3, m1);    \
-        t1 = _mm256_unpackhi_epi64(m6, m5);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_3_2(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m4, m0);    \
-        t1 = _mm256_unpacklo_epi64(m6, m7);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_3_3(b0)               \
-    do {                                       \
-        t0 = _mm256_blend_epi32(m2, m1, 0x33); \
-        t1 = _mm256_blend_epi32(m7, m2, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_3_4(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m3, m5);    \
-        t1 = _mm256_unpacklo_epi64(m0, m4);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_4_1(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m4, m2);    \
-        t1 = _mm256_unpacklo_epi64(m1, m5);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_4_2(b0)               \
-    do {                                       \
-        t0 = _mm256_blend_epi32(m3, m0, 0x33); \
-        t1 = _mm256_blend_epi32(m7, m2, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_4_3(b0)               \
-    do {                                       \
-        t0 = _mm256_blend_epi32(m5, m7, 0x33); \
-        t1 = _mm256_blend_epi32(m1, m3, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_4_4(b0)               \
-    do {                                       \
-        t0 = _mm256_alignr_epi8(m6, m0, 8);    \
-        t1 = _mm256_blend_epi32(m6, m4, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_5_1(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m1, m3);    \
-        t1 = _mm256_unpacklo_epi64(m0, m4);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_5_2(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m6, m5);    \
-        t1 = _mm256_unpackhi_epi64(m5, m1);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_5_3(b0)               \
-    do {                                       \
-        t0 = _mm256_blend_epi32(m3, m2, 0x33); \
-        t1 = _mm256_unpackhi_epi64(m7, m0);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_5_4(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m6, m2);    \
-        t1 = _mm256_blend_epi32(m4, m7, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_6_1(b0)               \
-    do {                                       \
-        t0 = _mm256_blend_epi32(m0, m6, 0x33); \
-        t1 = _mm256_unpacklo_epi64(m7, m2);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_6_2(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m2, m7);    \
-        t1 = _mm256_alignr_epi8(m5, m6, 8);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_6_3(b0)                                \
-    do {                                                        \
-        t0 = _mm256_unpacklo_epi64(m0, m3);                     \
-        t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0);                  \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_6_4(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m3, m1);    \
-        t1 = _mm256_blend_epi32(m5, m1, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_7_1(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m6, m3);    \
-        t1 = _mm256_blend_epi32(m1, m6, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_7_2(b0)               \
-    do {                                       \
-        t0 = _mm256_alignr_epi8(m7, m5, 8);    \
-        t1 = _mm256_unpackhi_epi64(m0, m4);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_7_3(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m2, m7);    \
-        t1 = _mm256_unpacklo_epi64(m4, m1);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_7_4(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m0, m2);    \
-        t1 = _mm256_unpacklo_epi64(m3, m5);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_8_1(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m3, m7);    \
-        t1 = _mm256_alignr_epi8(m0, m5, 8);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_8_2(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m7, m4);    \
-        t1 = _mm256_alignr_epi8(m4, m1, 8);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_8_3(b0)               \
-    do {                                       \
-        t0 = m6;                               \
-        t1 = _mm256_alignr_epi8(m5, m0, 8);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_8_4(b0)               \
-    do {                                       \
-        t0 = _mm256_blend_epi32(m3, m1, 0x33); \
-        t1 = m2;                               \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_9_1(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m5, m4);    \
-        t1 = _mm256_unpackhi_epi64(m3, m0);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_9_2(b0)               \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m1, m2);    \
-        t1 = _mm256_blend_epi32(m2, m3, 0x33); \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_9_3(b0)               \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m7, m4);    \
-        t1 = _mm256_unpackhi_epi64(m1, m6);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_9_4(b0)               \
-    do {                                       \
-        t0 = _mm256_alignr_epi8(m7, m5, 8);    \
-        t1 = _mm256_unpacklo_epi64(m6, m0);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_10_1(b0)              \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m0, m1);    \
-        t1 = _mm256_unpacklo_epi64(m2, m3);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_10_2(b0)              \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m0, m1);    \
-        t1 = _mm256_unpackhi_epi64(m2, m3);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_10_3(b0)              \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m4, m5);    \
-        t1 = _mm256_unpacklo_epi64(m6, m7);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_10_4(b0)              \
-    do {                                       \
-        t0 = _mm256_unpackhi_epi64(m4, m5);    \
-        t1 = _mm256_unpackhi_epi64(m6, m7);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_11_1(b0)              \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m7, m2);    \
-        t1 = _mm256_unpackhi_epi64(m4, m6);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_11_2(b0)              \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m5, m4);    \
-        t1 = _mm256_alignr_epi8(m3, m7, 8);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_11_3(b0)                               \
-    do {                                                        \
-        t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
-        t1 = _mm256_unpackhi_epi64(m5, m2);                     \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0);                  \
-    } while (0)
-
-#define BLAKE2B_LOAD_MSG_11_4(b0)              \
-    do {                                       \
-        t0 = _mm256_unpacklo_epi64(m6, m1);    \
-        t1 = _mm256_unpackhi_epi64(m3, m1);    \
-        b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
-    } while (0)
+#define BLAKE2B_LOAD_MSG_0_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m0, m1);    \
+    t1 = _mm256_unpacklo_epi64(m2, m3);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_0_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m0, m1);    \
+    t1 = _mm256_unpackhi_epi64(m2, m3);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_0_3(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m4, m5);    \
+    t1 = _mm256_unpacklo_epi64(m6, m7);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_0_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m4, m5);    \
+    t1 = _mm256_unpackhi_epi64(m6, m7);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_1_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m7, m2);    \
+    t1 = _mm256_unpackhi_epi64(m4, m6);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_1_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m5, m4);    \
+    t1 = _mm256_alignr_epi8(m3, m7, 8);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_1_3(b0)                            \
+  do                                                        \
+  {                                                         \
+    t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
+    t1 = _mm256_unpackhi_epi64(m5, m2);                     \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0);                  \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_1_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m6, m1);    \
+    t1 = _mm256_unpackhi_epi64(m3, m1);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_2_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_alignr_epi8(m6, m5, 8);    \
+    t1 = _mm256_unpackhi_epi64(m2, m7);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_2_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m4, m0);    \
+    t1 = _mm256_blend_epi32(m6, m1, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_2_3(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_blend_epi32(m1, m5, 0x33); \
+    t1 = _mm256_unpackhi_epi64(m3, m4);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_2_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m7, m3);    \
+    t1 = _mm256_alignr_epi8(m2, m0, 8);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_3_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m3, m1);    \
+    t1 = _mm256_unpackhi_epi64(m6, m5);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_3_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m4, m0);    \
+    t1 = _mm256_unpacklo_epi64(m6, m7);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_3_3(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_blend_epi32(m2, m1, 0x33); \
+    t1 = _mm256_blend_epi32(m7, m2, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_3_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m3, m5);    \
+    t1 = _mm256_unpacklo_epi64(m0, m4);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_4_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m4, m2);    \
+    t1 = _mm256_unpacklo_epi64(m1, m5);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_4_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_blend_epi32(m3, m0, 0x33); \
+    t1 = _mm256_blend_epi32(m7, m2, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_4_3(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_blend_epi32(m5, m7, 0x33); \
+    t1 = _mm256_blend_epi32(m1, m3, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_4_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_alignr_epi8(m6, m0, 8);    \
+    t1 = _mm256_blend_epi32(m6, m4, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_5_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m1, m3);    \
+    t1 = _mm256_unpacklo_epi64(m0, m4);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_5_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m6, m5);    \
+    t1 = _mm256_unpackhi_epi64(m5, m1);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_5_3(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_blend_epi32(m3, m2, 0x33); \
+    t1 = _mm256_unpackhi_epi64(m7, m0);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_5_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m6, m2);    \
+    t1 = _mm256_blend_epi32(m4, m7, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_6_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_blend_epi32(m0, m6, 0x33); \
+    t1 = _mm256_unpacklo_epi64(m7, m2);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_6_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m2, m7);    \
+    t1 = _mm256_alignr_epi8(m5, m6, 8);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_6_3(b0)                            \
+  do                                                        \
+  {                                                         \
+    t0 = _mm256_unpacklo_epi64(m0, m3);                     \
+    t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0);                  \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_6_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m3, m1);    \
+    t1 = _mm256_blend_epi32(m5, m1, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_7_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m6, m3);    \
+    t1 = _mm256_blend_epi32(m1, m6, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_7_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_alignr_epi8(m7, m5, 8);    \
+    t1 = _mm256_unpackhi_epi64(m0, m4);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_7_3(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m2, m7);    \
+    t1 = _mm256_unpacklo_epi64(m4, m1);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_7_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m0, m2);    \
+    t1 = _mm256_unpacklo_epi64(m3, m5);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_8_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m3, m7);    \
+    t1 = _mm256_alignr_epi8(m0, m5, 8);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_8_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m7, m4);    \
+    t1 = _mm256_alignr_epi8(m4, m1, 8);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_8_3(b0)           \
+  do                                       \
+  {                                        \
+    t0 = m6;                               \
+    t1 = _mm256_alignr_epi8(m5, m0, 8);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_8_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_blend_epi32(m3, m1, 0x33); \
+    t1 = m2;                               \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_9_1(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m5, m4);    \
+    t1 = _mm256_unpackhi_epi64(m3, m0);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_9_2(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m1, m2);    \
+    t1 = _mm256_blend_epi32(m2, m3, 0x33); \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_9_3(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m7, m4);    \
+    t1 = _mm256_unpackhi_epi64(m1, m6);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_9_4(b0)           \
+  do                                       \
+  {                                        \
+    t0 = _mm256_alignr_epi8(m7, m5, 8);    \
+    t1 = _mm256_unpacklo_epi64(m6, m0);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_10_1(b0)          \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m0, m1);    \
+    t1 = _mm256_unpacklo_epi64(m2, m3);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_10_2(b0)          \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m0, m1);    \
+    t1 = _mm256_unpackhi_epi64(m2, m3);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_10_3(b0)          \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m4, m5);    \
+    t1 = _mm256_unpacklo_epi64(m6, m7);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_10_4(b0)          \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpackhi_epi64(m4, m5);    \
+    t1 = _mm256_unpackhi_epi64(m6, m7);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_11_1(b0)          \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m7, m2);    \
+    t1 = _mm256_unpackhi_epi64(m4, m6);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_11_2(b0)          \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m5, m4);    \
+    t1 = _mm256_alignr_epi8(m3, m7, 8);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_11_3(b0)                           \
+  do                                                        \
+  {                                                         \
+    t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
+    t1 = _mm256_unpackhi_epi64(m5, m2);                     \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0);                  \
+  } while(0)
+
+#define BLAKE2B_LOAD_MSG_11_4(b0)          \
+  do                                       \
+  {                                        \
+    t0 = _mm256_unpacklo_epi64(m6, m1);    \
+    t1 = _mm256_unpackhi_epi64(m3, m1);    \
+    b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+  } while(0)

 #endif
--- a/crypto/blake2b/blake2b-load-sse2.h
+++ b/crypto/blake2b/blake2b-load-sse2.h
@@ -16,149 +16,149 @@
 #ifndef blake2b_load_sse2_H
 #define blake2b_load_sse2_H

-#define LOAD_MSG_0_1(b0, b1)     \
-    b0 = _mm_set_epi64x(m2, m0); \
-    b1 = _mm_set_epi64x(m6, m4)
-#define LOAD_MSG_0_2(b0, b1)     \
-    b0 = _mm_set_epi64x(m3, m1); \
-    b1 = _mm_set_epi64x(m7, m5)
-#define LOAD_MSG_0_3(b0, b1)      \
-    b0 = _mm_set_epi64x(m10, m8); \
-    b1 = _mm_set_epi64x(m14, m12)
-#define LOAD_MSG_0_4(b0, b1)      \
-    b0 = _mm_set_epi64x(m11, m9); \
-    b1 = _mm_set_epi64x(m15, m13)
-#define LOAD_MSG_1_1(b0, b1)      \
-    b0 = _mm_set_epi64x(m4, m14); \
-    b1 = _mm_set_epi64x(m13, m9)
-#define LOAD_MSG_1_2(b0, b1)      \
-    b0 = _mm_set_epi64x(m8, m10); \
-    b1 = _mm_set_epi64x(m6, m15)
-#define LOAD_MSG_1_3(b0, b1)     \
-    b0 = _mm_set_epi64x(m0, m1); \
-    b1 = _mm_set_epi64x(m5, m11)
-#define LOAD_MSG_1_4(b0, b1)      \
-    b0 = _mm_set_epi64x(m2, m12); \
-    b1 = _mm_set_epi64x(m3, m7)
-#define LOAD_MSG_2_1(b0, b1)       \
-    b0 = _mm_set_epi64x(m12, m11); \
-    b1 = _mm_set_epi64x(m15, m5)
-#define LOAD_MSG_2_2(b0, b1)     \
-    b0 = _mm_set_epi64x(m0, m8); \
-    b1 = _mm_set_epi64x(m13, m2)
-#define LOAD_MSG_2_3(b0, b1)      \
-    b0 = _mm_set_epi64x(m3, m10); \
-    b1 = _mm_set_epi64x(m9, m7)
-#define LOAD_MSG_2_4(b0, b1)      \
-    b0 = _mm_set_epi64x(m6, m14); \
-    b1 = _mm_set_epi64x(m4, m1)
-#define LOAD_MSG_3_1(b0, b1)     \
-    b0 = _mm_set_epi64x(m3, m7); \
-    b1 = _mm_set_epi64x(m11, m13)
-#define LOAD_MSG_3_2(b0, b1)     \
-    b0 = _mm_set_epi64x(m1, m9); \
-    b1 = _mm_set_epi64x(m14, m12)
-#define LOAD_MSG_3_3(b0, b1)     \
-    b0 = _mm_set_epi64x(m5, m2); \
-    b1 = _mm_set_epi64x(m15, m4)
-#define LOAD_MSG_3_4(b0, b1)      \
-    b0 = _mm_set_epi64x(m10, m6); \
-    b1 = _mm_set_epi64x(m8, m0)
-#define LOAD_MSG_4_1(b0, b1)     \
-    b0 = _mm_set_epi64x(m5, m9); \
-    b1 = _mm_set_epi64x(m10, m2)
-#define LOAD_MSG_4_2(b0, b1)     \
-    b0 = _mm_set_epi64x(m7, m0); \
-    b1 = _mm_set_epi64x(m15, m4)
-#define LOAD_MSG_4_3(b0, b1)       \
-    b0 = _mm_set_epi64x(m11, m14); \
-    b1 = _mm_set_epi64x(m3, m6)
-#define LOAD_MSG_4_4(b0, b1)      \
-    b0 = _mm_set_epi64x(m12, m1); \
-    b1 = _mm_set_epi64x(m13, m8)
-#define LOAD_MSG_5_1(b0, b1)     \
-    b0 = _mm_set_epi64x(m6, m2); \
-    b1 = _mm_set_epi64x(m8, m0)
-#define LOAD_MSG_5_2(b0, b1)       \
-    b0 = _mm_set_epi64x(m10, m12); \
-    b1 = _mm_set_epi64x(m3, m11)
-#define LOAD_MSG_5_3(b0, b1)     \
-    b0 = _mm_set_epi64x(m7, m4); \
-    b1 = _mm_set_epi64x(m1, m15)
-#define LOAD_MSG_5_4(b0, b1)      \
-    b0 = _mm_set_epi64x(m5, m13); \
-    b1 = _mm_set_epi64x(m9, m14)
-#define LOAD_MSG_6_1(b0, b1)      \
-    b0 = _mm_set_epi64x(m1, m12); \
-    b1 = _mm_set_epi64x(m4, m14)
-#define LOAD_MSG_6_2(b0, b1)      \
-    b0 = _mm_set_epi64x(m15, m5); \
-    b1 = _mm_set_epi64x(m10, m13)
-#define LOAD_MSG_6_3(b0, b1)     \
-    b0 = _mm_set_epi64x(m6, m0); \
-    b1 = _mm_set_epi64x(m8, m9)
-#define LOAD_MSG_6_4(b0, b1)     \
-    b0 = _mm_set_epi64x(m3, m7); \
-    b1 = _mm_set_epi64x(m11, m2)
-#define LOAD_MSG_7_1(b0, b1)      \
-    b0 = _mm_set_epi64x(m7, m13); \
-    b1 = _mm_set_epi64x(m3, m12)
-#define LOAD_MSG_7_2(b0, b1)       \
-    b0 = _mm_set_epi64x(m14, m11); \
-    b1 = _mm_set_epi64x(m9, m1)
-#define LOAD_MSG_7_3(b0, b1)      \
-    b0 = _mm_set_epi64x(m15, m5); \
-    b1 = _mm_set_epi64x(m2, m8)
-#define LOAD_MSG_7_4(b0, b1)     \
-    b0 = _mm_set_epi64x(m4, m0); \
-    b1 = _mm_set_epi64x(m10, m6)
-#define LOAD_MSG_8_1(b0, b1)      \
-    b0 = _mm_set_epi64x(m14, m6); \
-    b1 = _mm_set_epi64x(m0, m11)
-#define LOAD_MSG_8_2(b0, b1)      \
-    b0 = _mm_set_epi64x(m9, m15); \
-    b1 = _mm_set_epi64x(m8, m3)
-#define LOAD_MSG_8_3(b0, b1)       \
-    b0 = _mm_set_epi64x(m13, m12); \
-    b1 = _mm_set_epi64x(m10, m1)
-#define LOAD_MSG_8_4(b0, b1)     \
-    b0 = _mm_set_epi64x(m7, m2); \
-    b1 = _mm_set_epi64x(m5, m4)
-#define LOAD_MSG_9_1(b0, b1)      \
-    b0 = _mm_set_epi64x(m8, m10); \
-    b1 = _mm_set_epi64x(m1, m7)
-#define LOAD_MSG_9_2(b0, b1)     \
-    b0 = _mm_set_epi64x(m4, m2); \
-    b1 = _mm_set_epi64x(m5, m6)
-#define LOAD_MSG_9_3(b0, b1)      \
-    b0 = _mm_set_epi64x(m9, m15); \
-    b1 = _mm_set_epi64x(m13, m3)
-#define LOAD_MSG_9_4(b0, b1)       \
-    b0 = _mm_set_epi64x(m14, m11); \
-    b1 = _mm_set_epi64x(m0, m12)
-#define LOAD_MSG_10_1(b0, b1)    \
-    b0 = _mm_set_epi64x(m2, m0); \
-    b1 = _mm_set_epi64x(m6, m4)
-#define LOAD_MSG_10_2(b0, b1)    \
-    b0 = _mm_set_epi64x(m3, m1); \
-    b1 = _mm_set_epi64x(m7, m5)
-#define LOAD_MSG_10_3(b0, b1)     \
-    b0 = _mm_set_epi64x(m10, m8); \
-    b1 = _mm_set_epi64x(m14, m12)
-#define LOAD_MSG_10_4(b0, b1)     \
-    b0 = _mm_set_epi64x(m11, m9); \
-    b1 = _mm_set_epi64x(m15, m13)
-#define LOAD_MSG_11_1(b0, b1)     \
-    b0 = _mm_set_epi64x(m4, m14); \
-    b1 = _mm_set_epi64x(m13, m9)
-#define LOAD_MSG_11_2(b0, b1)     \
-    b0 = _mm_set_epi64x(m8, m10); \
-    b1 = _mm_set_epi64x(m6, m15)
-#define LOAD_MSG_11_3(b0, b1)    \
-    b0 = _mm_set_epi64x(m0, m1); \
-    b1 = _mm_set_epi64x(m5, m11)
-#define LOAD_MSG_11_4(b0, b1)     \
-    b0 = _mm_set_epi64x(m2, m12); \
-    b1 = _mm_set_epi64x(m3, m7)
+#define LOAD_MSG_0_1(b0, b1)   \
+  b0 = _mm_set_epi64x(m2, m0); \
+  b1 = _mm_set_epi64x(m6, m4)
+#define LOAD_MSG_0_2(b0, b1)   \
+  b0 = _mm_set_epi64x(m3, m1); \
+  b1 = _mm_set_epi64x(m7, m5)
+#define LOAD_MSG_0_3(b0, b1)    \
+  b0 = _mm_set_epi64x(m10, m8); \
+  b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_0_4(b0, b1)    \
+  b0 = _mm_set_epi64x(m11, m9); \
+  b1 = _mm_set_epi64x(m15, m13)
+#define LOAD_MSG_1_1(b0, b1)    \
+  b0 = _mm_set_epi64x(m4, m14); \
+  b1 = _mm_set_epi64x(m13, m9)
+#define LOAD_MSG_1_2(b0, b1)    \
+  b0 = _mm_set_epi64x(m8, m10); \
+  b1 = _mm_set_epi64x(m6, m15)
+#define LOAD_MSG_1_3(b0, b1)   \
+  b0 = _mm_set_epi64x(m0, m1); \
+  b1 = _mm_set_epi64x(m5, m11)
+#define LOAD_MSG_1_4(b0, b1)    \
+  b0 = _mm_set_epi64x(m2, m12); \
+  b1 = _mm_set_epi64x(m3, m7)
+#define LOAD_MSG_2_1(b0, b1)     \
+  b0 = _mm_set_epi64x(m12, m11); \
+  b1 = _mm_set_epi64x(m15, m5)
+#define LOAD_MSG_2_2(b0, b1)   \
+  b0 = _mm_set_epi64x(m0, m8); \
+  b1 = _mm_set_epi64x(m13, m2)
+#define LOAD_MSG_2_3(b0, b1)    \
+  b0 = _mm_set_epi64x(m3, m10); \
+  b1 = _mm_set_epi64x(m9, m7)
+#define LOAD_MSG_2_4(b0, b1)    \
+  b0 = _mm_set_epi64x(m6, m14); \
+  b1 = _mm_set_epi64x(m4, m1)
+#define LOAD_MSG_3_1(b0, b1)   \
+  b0 = _mm_set_epi64x(m3, m7); \
+  b1 = _mm_set_epi64x(m11, m13)
+#define LOAD_MSG_3_2(b0, b1)   \
+  b0 = _mm_set_epi64x(m1, m9); \
+  b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_3_3(b0, b1)   \
+  b0 = _mm_set_epi64x(m5, m2); \
+  b1 = _mm_set_epi64x(m15, m4)
+#define LOAD_MSG_3_4(b0, b1)    \
+  b0 = _mm_set_epi64x(m10, m6); \
+  b1 = _mm_set_epi64x(m8, m0)
+#define LOAD_MSG_4_1(b0, b1)   \
+  b0 = _mm_set_epi64x(m5, m9); \
+  b1 = _mm_set_epi64x(m10, m2)
+#define LOAD_MSG_4_2(b0, b1)   \
+  b0 = _mm_set_epi64x(m7, m0); \
+  b1 = _mm_set_epi64x(m15, m4)
+#define LOAD_MSG_4_3(b0, b1)     \
+  b0 = _mm_set_epi64x(m11, m14); \
+  b1 = _mm_set_epi64x(m3, m6)
+#define LOAD_MSG_4_4(b0, b1)    \
+  b0 = _mm_set_epi64x(m12, m1); \
+  b1 = _mm_set_epi64x(m13, m8)
+#define LOAD_MSG_5_1(b0, b1)   \
+  b0 = _mm_set_epi64x(m6, m2); \
+  b1 = _mm_set_epi64x(m8, m0)
+#define LOAD_MSG_5_2(b0, b1)     \
+  b0 = _mm_set_epi64x(m10, m12); \
+  b1 = _mm_set_epi64x(m3, m11)
+#define LOAD_MSG_5_3(b0, b1)   \
+  b0 = _mm_set_epi64x(m7, m4); \
+  b1 = _mm_set_epi64x(m1, m15)
+#define LOAD_MSG_5_4(b0, b1)    \
+  b0 = _mm_set_epi64x(m5, m13); \
+  b1 = _mm_set_epi64x(m9, m14)
+#define LOAD_MSG_6_1(b0, b1)    \
+  b0 = _mm_set_epi64x(m1, m12); \
+  b1 = _mm_set_epi64x(m4, m14)
+#define LOAD_MSG_6_2(b0, b1)    \
+  b0 = _mm_set_epi64x(m15, m5); \
+  b1 = _mm_set_epi64x(m10, m13)
+#define LOAD_MSG_6_3(b0, b1)   \
+  b0 = _mm_set_epi64x(m6, m0); \
+  b1 = _mm_set_epi64x(m8, m9)
+#define LOAD_MSG_6_4(b0, b1)   \
+  b0 = _mm_set_epi64x(m3, m7); \
+  b1 = _mm_set_epi64x(m11, m2)
+#define LOAD_MSG_7_1(b0, b1)    \
+  b0 = _mm_set_epi64x(m7, m13); \
+  b1 = _mm_set_epi64x(m3, m12)
+#define LOAD_MSG_7_2(b0, b1)     \
+  b0 = _mm_set_epi64x(m14, m11); \
+  b1 = _mm_set_epi64x(m9, m1)
+#define LOAD_MSG_7_3(b0, b1)    \
+  b0 = _mm_set_epi64x(m15, m5); \
+  b1 = _mm_set_epi64x(m2, m8)
+#define LOAD_MSG_7_4(b0, b1)   \
+  b0 = _mm_set_epi64x(m4, m0); \
+  b1 = _mm_set_epi64x(m10, m6)
+#define LOAD_MSG_8_1(b0, b1)    \
+  b0 = _mm_set_epi64x(m14, m6); \
+  b1 = _mm_set_epi64x(m0, m11)
+#define LOAD_MSG_8_2(b0, b1)    \
+  b0 = _mm_set_epi64x(m9, m15); \
+  b1 = _mm_set_epi64x(m8, m3)
+#define LOAD_MSG_8_3(b0, b1)     \
+  b0 = _mm_set_epi64x(m13, m12); \
+  b1 = _mm_set_epi64x(m10, m1)
+#define LOAD_MSG_8_4(b0, b1)   \
+  b0 = _mm_set_epi64x(m7, m2); \
+  b1 = _mm_set_epi64x(m5, m4)
+#define LOAD_MSG_9_1(b0, b1)    \
+  b0 = _mm_set_epi64x(m8, m10); \
+  b1 = _mm_set_epi64x(m1, m7)
+#define LOAD_MSG_9_2(b0, b1)   \
+  b0 = _mm_set_epi64x(m4, m2); \
+  b1 = _mm_set_epi64x(m5, m6)
+#define LOAD_MSG_9_3(b0, b1)    \
+  b0 = _mm_set_epi64x(m9, m15); \
+  b1 = _mm_set_epi64x(m13, m3)
+#define LOAD_MSG_9_4(b0, b1)     \
+  b0 = _mm_set_epi64x(m14, m11); \
+  b1 = _mm_set_epi64x(m0, m12)
+#define LOAD_MSG_10_1(b0, b1)  \
+  b0 = _mm_set_epi64x(m2, m0); \
+  b1 = _mm_set_epi64x(m6, m4)
+#define LOAD_MSG_10_2(b0, b1)  \
+  b0 = _mm_set_epi64x(m3, m1); \
+  b1 = _mm_set_epi64x(m7, m5)
+#define LOAD_MSG_10_3(b0, b1)   \
+  b0 = _mm_set_epi64x(m10, m8); \
+  b1 = _mm_set_epi64x(m14, m12)
+#define LOAD_MSG_10_4(b0, b1)   \
+  b0 = _mm_set_epi64x(m11, m9); \
+  b1 = _mm_set_epi64x(m15, m13)
+#define LOAD_MSG_11_1(b0, b1)   \
+  b0 = _mm_set_epi64x(m4, m14); \
+  b1 = _mm_set_epi64x(m13, m9)
+#define LOAD_MSG_11_2(b0, b1)   \
+  b0 = _mm_set_epi64x(m8, m10); \
+  b1 = _mm_set_epi64x(m6, m15)
+#define LOAD_MSG_11_3(b0, b1)  \
+  b0 = _mm_set_epi64x(m0, m1); \
+  b1 = _mm_set_epi64x(m5, m11)
+#define LOAD_MSG_11_4(b0, b1)   \
+  b0 = _mm_set_epi64x(m2, m12); \
+  b1 = _mm_set_epi64x(m3, m7)

 #endif
--- a/crypto/blake2b/blake2b-load-sse41.h
+++ b/crypto/blake2b/blake2b-load-sse41.h
@@ -16,292 +16,340 @@
 #ifndef blake2b_load_sse41_H
 #define blake2b_load_sse41_H

-#define LOAD_MSG_0_1(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m0, m1); \
-        b1 = _mm_unpacklo_epi64(m2, m3); \
-    } while (0)
-
-#define LOAD_MSG_0_2(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m0, m1); \
-        b1 = _mm_unpackhi_epi64(m2, m3); \
-    } while (0)
-
-#define LOAD_MSG_0_3(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m4, m5); \
-        b1 = _mm_unpacklo_epi64(m6, m7); \
-    } while (0)
-
-#define LOAD_MSG_0_4(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m4, m5); \
-        b1 = _mm_unpackhi_epi64(m6, m7); \
-    } while (0)
-
-#define LOAD_MSG_1_1(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m7, m2); \
-        b1 = _mm_unpackhi_epi64(m4, m6); \
-    } while (0)
-
-#define LOAD_MSG_1_2(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m5, m4); \
-        b1 = _mm_alignr_epi8(m3, m7, 8); \
-    } while (0)
-
-#define LOAD_MSG_1_3(b0, b1)                                 \
-    do {                                                     \
-        b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
-        b1 = _mm_unpackhi_epi64(m5, m2);                     \
-    } while (0)
-
-#define LOAD_MSG_1_4(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m6, m1); \
-        b1 = _mm_unpackhi_epi64(m3, m1); \
-    } while (0)
-
-#define LOAD_MSG_2_1(b0, b1)             \
-    do {                                 \
-        b0 = _mm_alignr_epi8(m6, m5, 8); \
-        b1 = _mm_unpackhi_epi64(m2, m7); \
-    } while (0)
-
-#define LOAD_MSG_2_2(b0, b1)                \
-    do {                                    \
-        b0 = _mm_unpacklo_epi64(m4, m0);    \
-        b1 = _mm_blend_epi16(m1, m6, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_2_3(b0, b1)                \
-    do {                                    \
-        b0 = _mm_blend_epi16(m5, m1, 0xF0); \
-        b1 = _mm_unpackhi_epi64(m3, m4);    \
-    } while (0)
-
-#define LOAD_MSG_2_4(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m7, m3); \
-        b1 = _mm_alignr_epi8(m2, m0, 8); \
-    } while (0)
-
-#define LOAD_MSG_3_1(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m3, m1); \
-        b1 = _mm_unpackhi_epi64(m6, m5); \
-    } while (0)
-
-#define LOAD_MSG_3_2(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m4, m0); \
-        b1 = _mm_unpacklo_epi64(m6, m7); \
-    } while (0)
-
-#define LOAD_MSG_3_3(b0, b1)                \
-    do {                                    \
-        b0 = _mm_blend_epi16(m1, m2, 0xF0); \
-        b1 = _mm_blend_epi16(m2, m7, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_3_4(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m3, m5); \
-        b1 = _mm_unpacklo_epi64(m0, m4); \
-    } while (0)
-
-#define LOAD_MSG_4_1(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m4, m2); \
-        b1 = _mm_unpacklo_epi64(m1, m5); \
-    } while (0)
-
-#define LOAD_MSG_4_2(b0, b1)                \
-    do {                                    \
-        b0 = _mm_blend_epi16(m0, m3, 0xF0); \
-        b1 = _mm_blend_epi16(m2, m7, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_4_3(b0, b1)                \
-    do {                                    \
-        b0 = _mm_blend_epi16(m7, m5, 0xF0); \
-        b1 = _mm_blend_epi16(m3, m1, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_4_4(b0, b1)                \
-    do {                                    \
-        b0 = _mm_alignr_epi8(m6, m0, 8);    \
-        b1 = _mm_blend_epi16(m4, m6, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_5_1(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m1, m3); \
-        b1 = _mm_unpacklo_epi64(m0, m4); \
-    } while (0)
-
-#define LOAD_MSG_5_2(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m6, m5); \
-        b1 = _mm_unpackhi_epi64(m5, m1); \
-    } while (0)
-
-#define LOAD_MSG_5_3(b0, b1)                \
-    do {                                    \
-        b0 = _mm_blend_epi16(m2, m3, 0xF0); \
-        b1 = _mm_unpackhi_epi64(m7, m0);    \
-    } while (0)
-
-#define LOAD_MSG_5_4(b0, b1)                \
-    do {                                    \
-        b0 = _mm_unpackhi_epi64(m6, m2);    \
-        b1 = _mm_blend_epi16(m7, m4, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_6_1(b0, b1)                \
-    do {                                    \
-        b0 = _mm_blend_epi16(m6, m0, 0xF0); \
-        b1 = _mm_unpacklo_epi64(m7, m2);    \
-    } while (0)
-
-#define LOAD_MSG_6_2(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m2, m7); \
-        b1 = _mm_alignr_epi8(m5, m6, 8); \
-    } while (0)
-
-#define LOAD_MSG_6_3(b0, b1)                                 \
-    do {                                                     \
-        b0 = _mm_unpacklo_epi64(m0, m3);                     \
-        b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
-    } while (0)
-
-#define LOAD_MSG_6_4(b0, b1)                \
-    do {                                    \
-        b0 = _mm_unpackhi_epi64(m3, m1);    \
-        b1 = _mm_blend_epi16(m1, m5, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_7_1(b0, b1)                \
-    do {                                    \
-        b0 = _mm_unpackhi_epi64(m6, m3);    \
-        b1 = _mm_blend_epi16(m6, m1, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_7_2(b0, b1)             \
-    do {                                 \
-        b0 = _mm_alignr_epi8(m7, m5, 8); \
-        b1 = _mm_unpackhi_epi64(m0, m4); \
-    } while (0)
-
-#define LOAD_MSG_7_3(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m2, m7); \
-        b1 = _mm_unpacklo_epi64(m4, m1); \
-    } while (0)
-
-#define LOAD_MSG_7_4(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m0, m2); \
-        b1 = _mm_unpacklo_epi64(m3, m5); \
-    } while (0)
-
-#define LOAD_MSG_8_1(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m3, m7); \
-        b1 = _mm_alignr_epi8(m0, m5, 8); \
-    } while (0)
-
-#define LOAD_MSG_8_2(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m7, m4); \
-        b1 = _mm_alignr_epi8(m4, m1, 8); \
-    } while (0)
-
-#define LOAD_MSG_8_3(b0, b1)             \
-    do {                                 \
-        b0 = m6;                         \
-        b1 = _mm_alignr_epi8(m5, m0, 8); \
-    } while (0)
-
-#define LOAD_MSG_8_4(b0, b1)                \
-    do {                                    \
-        b0 = _mm_blend_epi16(m1, m3, 0xF0); \
-        b1 = m2;                            \
-    } while (0)
-
-#define LOAD_MSG_9_1(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m5, m4); \
-        b1 = _mm_unpackhi_epi64(m3, m0); \
-    } while (0)
-
-#define LOAD_MSG_9_2(b0, b1)                \
-    do {                                    \
-        b0 = _mm_unpacklo_epi64(m1, m2);    \
-        b1 = _mm_blend_epi16(m3, m2, 0xF0); \
-    } while (0)
-
-#define LOAD_MSG_9_3(b0, b1)             \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m7, m4); \
-        b1 = _mm_unpackhi_epi64(m1, m6); \
-    } while (0)
-
-#define LOAD_MSG_9_4(b0, b1)             \
-    do {                                 \
-        b0 = _mm_alignr_epi8(m7, m5, 8); \
-        b1 = _mm_unpacklo_epi64(m6, m0); \
-    } while (0)
-
-#define LOAD_MSG_10_1(b0, b1)            \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m0, m1); \
-        b1 = _mm_unpacklo_epi64(m2, m3); \
-    } while (0)
-
-#define LOAD_MSG_10_2(b0, b1)            \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m0, m1); \
-        b1 = _mm_unpackhi_epi64(m2, m3); \
-    } while (0)
-
-#define LOAD_MSG_10_3(b0, b1)            \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m4, m5); \
-        b1 = _mm_unpacklo_epi64(m6, m7); \
-    } while (0)
-
-#define LOAD_MSG_10_4(b0, b1)            \
-    do {                                 \
-        b0 = _mm_unpackhi_epi64(m4, m5); \
-        b1 = _mm_unpackhi_epi64(m6, m7); \
-    } while (0)
-
-#define LOAD_MSG_11_1(b0, b1)            \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m7, m2); \
-        b1 = _mm_unpackhi_epi64(m4, m6); \
-    } while (0)
-
-#define LOAD_MSG_11_2(b0, b1)            \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m5, m4); \
-        b1 = _mm_alignr_epi8(m3, m7, 8); \
-    } while (0)
-
-#define LOAD_MSG_11_3(b0, b1)                                \
-    do {                                                     \
-        b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
-        b1 = _mm_unpackhi_epi64(m5, m2);                     \
-    } while (0)
-
-#define LOAD_MSG_11_4(b0, b1)            \
-    do {                                 \
-        b0 = _mm_unpacklo_epi64(m6, m1); \
-        b1 = _mm_unpackhi_epi64(m3, m1); \
-    } while (0)
+#define LOAD_MSG_0_1(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m0, m1); \
+    b1 = _mm_unpacklo_epi64(m2, m3); \
+  } while(0)
+
+#define LOAD_MSG_0_2(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m0, m1); \
+    b1 = _mm_unpackhi_epi64(m2, m3); \
+  } while(0)
+
+#define LOAD_MSG_0_3(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m4, m5); \
+    b1 = _mm_unpacklo_epi64(m6, m7); \
+  } while(0)
+
+#define LOAD_MSG_0_4(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m4, m5); \
+    b1 = _mm_unpackhi_epi64(m6, m7); \
+  } while(0)
+
+#define LOAD_MSG_1_1(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m7, m2); \
+    b1 = _mm_unpackhi_epi64(m4, m6); \
+  } while(0)
+
+#define LOAD_MSG_1_2(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m5, m4); \
+    b1 = _mm_alignr_epi8(m3, m7, 8); \
+  } while(0)
+
+#define LOAD_MSG_1_3(b0, b1)                             \
+  do                                                     \
+  {                                                      \
+    b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
+    b1 = _mm_unpackhi_epi64(m5, m2);                     \
+  } while(0)
+
+#define LOAD_MSG_1_4(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m6, m1); \
+    b1 = _mm_unpackhi_epi64(m3, m1); \
+  } while(0)
+
+#define LOAD_MSG_2_1(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_alignr_epi8(m6, m5, 8); \
+    b1 = _mm_unpackhi_epi64(m2, m7); \
+  } while(0)
+
+#define LOAD_MSG_2_2(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_unpacklo_epi64(m4, m0);    \
+    b1 = _mm_blend_epi16(m1, m6, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_2_3(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_blend_epi16(m5, m1, 0xF0); \
+    b1 = _mm_unpackhi_epi64(m3, m4);    \
+  } while(0)
+
+#define LOAD_MSG_2_4(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m7, m3); \
+    b1 = _mm_alignr_epi8(m2, m0, 8); \
+  } while(0)
+
+#define LOAD_MSG_3_1(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m3, m1); \
+    b1 = _mm_unpackhi_epi64(m6, m5); \
+  } while(0)
+
+#define LOAD_MSG_3_2(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m4, m0); \
+    b1 = _mm_unpacklo_epi64(m6, m7); \
+  } while(0)
+
+#define LOAD_MSG_3_3(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_blend_epi16(m1, m2, 0xF0); \
+    b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_3_4(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m3, m5); \
+    b1 = _mm_unpacklo_epi64(m0, m4); \
+  } while(0)
+
+#define LOAD_MSG_4_1(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m4, m2); \
+    b1 = _mm_unpacklo_epi64(m1, m5); \
+  } while(0)
+
+#define LOAD_MSG_4_2(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_blend_epi16(m0, m3, 0xF0); \
+    b1 = _mm_blend_epi16(m2, m7, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_4_3(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_blend_epi16(m7, m5, 0xF0); \
+    b1 = _mm_blend_epi16(m3, m1, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_4_4(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_alignr_epi8(m6, m0, 8);    \
+    b1 = _mm_blend_epi16(m4, m6, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_5_1(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m1, m3); \
+    b1 = _mm_unpacklo_epi64(m0, m4); \
+  } while(0)
+
+#define LOAD_MSG_5_2(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m6, m5); \
+    b1 = _mm_unpackhi_epi64(m5, m1); \
+  } while(0)
+
+#define LOAD_MSG_5_3(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_blend_epi16(m2, m3, 0xF0); \
+    b1 = _mm_unpackhi_epi64(m7, m0);    \
+  } while(0)
+
+#define LOAD_MSG_5_4(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_unpackhi_epi64(m6, m2);    \
+    b1 = _mm_blend_epi16(m7, m4, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_6_1(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_blend_epi16(m6, m0, 0xF0); \
+    b1 = _mm_unpacklo_epi64(m7, m2);    \
+  } while(0)
+
+#define LOAD_MSG_6_2(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m2, m7); \
+    b1 = _mm_alignr_epi8(m5, m6, 8); \
+  } while(0)
+
+#define LOAD_MSG_6_3(b0, b1)                             \
+  do                                                     \
+  {                                                      \
+    b0 = _mm_unpacklo_epi64(m0, m3);                     \
+    b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
+  } while(0)
+
+#define LOAD_MSG_6_4(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_unpackhi_epi64(m3, m1);    \
+    b1 = _mm_blend_epi16(m1, m5, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_7_1(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_unpackhi_epi64(m6, m3);    \
+    b1 = _mm_blend_epi16(m6, m1, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_7_2(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_alignr_epi8(m7, m5, 8); \
+    b1 = _mm_unpackhi_epi64(m0, m4); \
+  } while(0)
+
+#define LOAD_MSG_7_3(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m2, m7); \
+    b1 = _mm_unpacklo_epi64(m4, m1); \
+  } while(0)
+
+#define LOAD_MSG_7_4(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m0, m2); \
+    b1 = _mm_unpacklo_epi64(m3, m5); \
+  } while(0)
+
+#define LOAD_MSG_8_1(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m3, m7); \
+    b1 = _mm_alignr_epi8(m0, m5, 8); \
+  } while(0)
+
+#define LOAD_MSG_8_2(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m7, m4); \
+    b1 = _mm_alignr_epi8(m4, m1, 8); \
+  } while(0)
+
+#define LOAD_MSG_8_3(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = m6;                         \
+    b1 = _mm_alignr_epi8(m5, m0, 8); \
+  } while(0)
+
+#define LOAD_MSG_8_4(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_blend_epi16(m1, m3, 0xF0); \
+    b1 = m2;                            \
+  } while(0)
+
+#define LOAD_MSG_9_1(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m5, m4); \
+    b1 = _mm_unpackhi_epi64(m3, m0); \
+  } while(0)
+
+#define LOAD_MSG_9_2(b0, b1)            \
+  do                                    \
+  {                                     \
+    b0 = _mm_unpacklo_epi64(m1, m2);    \
+    b1 = _mm_blend_epi16(m3, m2, 0xF0); \
+  } while(0)
+
+#define LOAD_MSG_9_3(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m7, m4); \
+    b1 = _mm_unpackhi_epi64(m1, m6); \
+  } while(0)
+
+#define LOAD_MSG_9_4(b0, b1)         \
+  do                                 \
+  {                                  \
+    b0 = _mm_alignr_epi8(m7, m5, 8); \
+    b1 = _mm_unpacklo_epi64(m6, m0); \
+  } while(0)
+
+#define LOAD_MSG_10_1(b0, b1)        \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m0, m1); \
+    b1 = _mm_unpacklo_epi64(m2, m3); \
+  } while(0)
+
+#define LOAD_MSG_10_2(b0, b1)        \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m0, m1); \
+    b1 = _mm_unpackhi_epi64(m2, m3); \
+  } while(0)
+
+#define LOAD_MSG_10_3(b0, b1)        \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m4, m5); \
+    b1 = _mm_unpacklo_epi64(m6, m7); \
+  } while(0)
+
+#define LOAD_MSG_10_4(b0, b1)        \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpackhi_epi64(m4, m5); \
+    b1 = _mm_unpackhi_epi64(m6, m7); \
+  } while(0)
+
+#define LOAD_MSG_11_1(b0, b1)        \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m7, m2); \
+    b1 = _mm_unpackhi_epi64(m4, m6); \
+  } while(0)
+
+#define LOAD_MSG_11_2(b0, b1)        \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m5, m4); \
+    b1 = _mm_alignr_epi8(m3, m7, 8); \
+  } while(0)
+
+#define LOAD_MSG_11_3(b0, b1)                            \
+  do                                                     \
+  {                                                      \
+    b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
+    b1 = _mm_unpackhi_epi64(m5, m2);                     \
+  } while(0)
+
+#define LOAD_MSG_11_4(b0, b1)        \
+  do                                 \
+  {                                  \
+    b0 = _mm_unpacklo_epi64(m6, m1); \
+    b1 = _mm_unpackhi_epi64(m3, m1); \
+  } while(0)

 #endif
--- a/crypto/chacha20/dolbeau/chacha20_dolbeau-avx2.c
+++ b/crypto/chacha20/dolbeau/chacha20_dolbeau-avx2.c
@@ -24,15 +24,17 @@

 #ifndef __amd64__
 #ifdef __clang__
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
 #else
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __target__("sse2")))
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __target__("sse2")))
 #endif

 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si128(long long __a)
 {
-  return (__m128i){ __a, 0 };
+  return (__m128i){__a, 0};
 }
 #endif

--- a/crypto/chacha20/dolbeau/u0.h
+++ b/crypto/chacha20/dolbeau/u0.h
@ -1,86 +1,89 @@
-if (bytes > 0) {
-    __m128i       x_0, x_1, x_2, x_3;
-    __m128i       t_1;
-    const __m128i rot16 =
-        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
-    const __m128i rot8 =
-        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
-    uint8_t partialblock[64];
-
-    unsigned int i;
-
-    x_0 = _mm_loadu_si128((__m128i*) (x + 0));
-    x_1 = _mm_loadu_si128((__m128i*) (x + 4));
-    x_2 = _mm_loadu_si128((__m128i*) (x + 8));
-    x_3 = _mm_loadu_si128((__m128i*) (x + 12));
-
-    for (i = 0; i < ROUNDS; i += 2) {
-        x_0 = _mm_add_epi32(x_0, x_1);
-        x_3 = _mm_xor_si128(x_3, x_0);
-        x_3 = _mm_shuffle_epi8(x_3, rot16);
-
-        x_2 = _mm_add_epi32(x_2, x_3);
-        x_1 = _mm_xor_si128(x_1, x_2);
-
-        t_1 = x_1;
-        x_1 = _mm_slli_epi32(x_1, 12);
-        t_1 = _mm_srli_epi32(t_1, 20);
-        x_1 = _mm_xor_si128(x_1, t_1);
-
-        x_0 = _mm_add_epi32(x_0, x_1);
-        x_3 = _mm_xor_si128(x_3, x_0);
-        x_0 = _mm_shuffle_epi32(x_0, 0x93);
-        x_3 = _mm_shuffle_epi8(x_3, rot8);
-
-        x_2 = _mm_add_epi32(x_2, x_3);
-        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
-        x_1 = _mm_xor_si128(x_1, x_2);
-        x_2 = _mm_shuffle_epi32(x_2, 0x39);
-
-        t_1 = x_1;
-        x_1 = _mm_slli_epi32(x_1, 7);
-        t_1 = _mm_srli_epi32(t_1, 25);
-        x_1 = _mm_xor_si128(x_1, t_1);
-
-        x_0 = _mm_add_epi32(x_0, x_1);
-        x_3 = _mm_xor_si128(x_3, x_0);
-        x_3 = _mm_shuffle_epi8(x_3, rot16);
-
-        x_2 = _mm_add_epi32(x_2, x_3);
-        x_1 = _mm_xor_si128(x_1, x_2);
-
-        t_1 = x_1;
-        x_1 = _mm_slli_epi32(x_1, 12);
-        t_1 = _mm_srli_epi32(t_1, 20);
-        x_1 = _mm_xor_si128(x_1, t_1);
-
-        x_0 = _mm_add_epi32(x_0, x_1);
-        x_3 = _mm_xor_si128(x_3, x_0);
-        x_0 = _mm_shuffle_epi32(x_0, 0x39);
-        x_3 = _mm_shuffle_epi8(x_3, rot8);
-
-        x_2 = _mm_add_epi32(x_2, x_3);
-        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
-        x_1 = _mm_xor_si128(x_1, x_2);
-        x_2 = _mm_shuffle_epi32(x_2, 0x93);
-
-        t_1 = x_1;
-        x_1 = _mm_slli_epi32(x_1, 7);
-        t_1 = _mm_srli_epi32(t_1, 25);
-        x_1 = _mm_xor_si128(x_1, t_1);
-    }
-    x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
-    x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
-    x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
-    x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
-    _mm_storeu_si128((__m128i*) (partialblock + 0), x_0);
-    _mm_storeu_si128((__m128i*) (partialblock + 16), x_1);
-    _mm_storeu_si128((__m128i*) (partialblock + 32), x_2);
-    _mm_storeu_si128((__m128i*) (partialblock + 48), x_3);
-
-    for (i = 0; i < bytes; i++) {
-        c[i] = m[i] ^ partialblock[i];
-    }
-
-    sodium_memzero(partialblock, sizeof partialblock);
+if(bytes > 0)
+{
+  __m128i x_0, x_1, x_2, x_3;
+  __m128i t_1;
+  const __m128i rot16 =
+      _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+  const __m128i rot8 =
+      _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
+  uint8_t partialblock[64];
+
+  unsigned int i;
+
+  x_0 = _mm_loadu_si128((__m128i*)(x + 0));
+  x_1 = _mm_loadu_si128((__m128i*)(x + 4));
+  x_2 = _mm_loadu_si128((__m128i*)(x + 8));
+  x_3 = _mm_loadu_si128((__m128i*)(x + 12));
+
+  for(i = 0; i < ROUNDS; i += 2)
+  {
+    x_0 = _mm_add_epi32(x_0, x_1);
+    x_3 = _mm_xor_si128(x_3, x_0);
+    x_3 = _mm_shuffle_epi8(x_3, rot16);
+
+    x_2 = _mm_add_epi32(x_2, x_3);
+    x_1 = _mm_xor_si128(x_1, x_2);
+
+    t_1 = x_1;
+    x_1 = _mm_slli_epi32(x_1, 12);
+    t_1 = _mm_srli_epi32(t_1, 20);
+    x_1 = _mm_xor_si128(x_1, t_1);
+
+    x_0 = _mm_add_epi32(x_0, x_1);
+    x_3 = _mm_xor_si128(x_3, x_0);
+    x_0 = _mm_shuffle_epi32(x_0, 0x93);
+    x_3 = _mm_shuffle_epi8(x_3, rot8);
+
+    x_2 = _mm_add_epi32(x_2, x_3);
+    x_3 = _mm_shuffle_epi32(x_3, 0x4e);
+    x_1 = _mm_xor_si128(x_1, x_2);
+    x_2 = _mm_shuffle_epi32(x_2, 0x39);
+
+    t_1 = x_1;
+    x_1 = _mm_slli_epi32(x_1, 7);
+    t_1 = _mm_srli_epi32(t_1, 25);
+    x_1 = _mm_xor_si128(x_1, t_1);
+
+    x_0 = _mm_add_epi32(x_0, x_1);
+    x_3 = _mm_xor_si128(x_3, x_0);
+    x_3 = _mm_shuffle_epi8(x_3, rot16);
+
+    x_2 = _mm_add_epi32(x_2, x_3);
+    x_1 = _mm_xor_si128(x_1, x_2);
+
+    t_1 = x_1;
+    x_1 = _mm_slli_epi32(x_1, 12);
+    t_1 = _mm_srli_epi32(t_1, 20);
+    x_1 = _mm_xor_si128(x_1, t_1);
+
+    x_0 = _mm_add_epi32(x_0, x_1);
+    x_3 = _mm_xor_si128(x_3, x_0);
+    x_0 = _mm_shuffle_epi32(x_0, 0x39);
+    x_3 = _mm_shuffle_epi8(x_3, rot8);
+
+    x_2 = _mm_add_epi32(x_2, x_3);
+    x_3 = _mm_shuffle_epi32(x_3, 0x4e);
+    x_1 = _mm_xor_si128(x_1, x_2);
+    x_2 = _mm_shuffle_epi32(x_2, 0x93);
+
+    t_1 = x_1;
+    x_1 = _mm_slli_epi32(x_1, 7);
+    t_1 = _mm_srli_epi32(t_1, 25);
+    x_1 = _mm_xor_si128(x_1, t_1);
+  }
+  x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
+  x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
+  x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
+  x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
+  _mm_storeu_si128((__m128i*)(partialblock + 0), x_0);
+  _mm_storeu_si128((__m128i*)(partialblock + 16), x_1);
+  _mm_storeu_si128((__m128i*)(partialblock + 32), x_2);
+  _mm_storeu_si128((__m128i*)(partialblock + 48), x_3);
+
+  for(i = 0; i < bytes; i++)
+  {
+    c[i] = m[i] ^ partialblock[i];
+  }
+
+  sodium_memzero(partialblock, sizeof partialblock);
 }
--- a/crypto/chacha20/dolbeau/u1.h
+++ b/crypto/chacha20/dolbeau/u1.h
@ -1,98 +1,101 @@
-while (bytes >= 64) {
-    __m128i       x_0, x_1, x_2, x_3;
-    __m128i       t_1;
-    const __m128i rot16 =
-        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
-    const __m128i rot8 =
-        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
-
-    uint32_t in12;
-    uint32_t in13;
-    int      i;
-
-    x_0 = _mm_loadu_si128((__m128i*) (x + 0));
-    x_1 = _mm_loadu_si128((__m128i*) (x + 4));
-    x_2 = _mm_loadu_si128((__m128i*) (x + 8));
-    x_3 = _mm_loadu_si128((__m128i*) (x + 12));
-
-    for (i = 0; i < ROUNDS; i += 2) {
-        x_0 = _mm_add_epi32(x_0, x_1);
-        x_3 = _mm_xor_si128(x_3, x_0);
-        x_3 = _mm_shuffle_epi8(x_3, rot16);
-
-        x_2 = _mm_add_epi32(x_2, x_3);
-        x_1 = _mm_xor_si128(x_1, x_2);
-
-        t_1 = x_1;
-        x_1 = _mm_slli_epi32(x_1, 12);
-        t_1 = _mm_srli_epi32(t_1, 20);
-        x_1 = _mm_xor_si128(x_1, t_1);
-
-        x_0 = _mm_add_epi32(x_0, x_1);
-        x_3 = _mm_xor_si128(x_3, x_0);
-        x_0 = _mm_shuffle_epi32(x_0, 0x93);
-        x_3 = _mm_shuffle_epi8(x_3, rot8);
-
-        x_2 = _mm_add_epi32(x_2, x_3);
-        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
-        x_1 = _mm_xor_si128(x_1, x_2);
-        x_2 = _mm_shuffle_epi32(x_2, 0x39);
-
-        t_1 = x_1;
-        x_1 = _mm_slli_epi32(x_1, 7);
-        t_1 = _mm_srli_epi32(t_1, 25);
-        x_1 = _mm_xor_si128(x_1, t_1);
-
-        x_0 = _mm_add_epi32(x_0, x_1);
-        x_3 = _mm_xor_si128(x_3, x_0);
-        x_3 = _mm_shuffle_epi8(x_3, rot16);
-
-        x_2 = _mm_add_epi32(x_2, x_3);
-        x_1 = _mm_xor_si128(x_1, x_2);
-
-        t_1 = x_1;
-        x_1 = _mm_slli_epi32(x_1, 12);
-        t_1 = _mm_srli_epi32(t_1, 20);
-        x_1 = _mm_xor_si128(x_1, t_1);
-
-        x_0 = _mm_add_epi32(x_0, x_1);
-        x_3 = _mm_xor_si128(x_3, x_0);
-        x_0 = _mm_shuffle_epi32(x_0, 0x39);
-        x_3 = _mm_shuffle_epi8(x_3, rot8);
-
-        x_2 = _mm_add_epi32(x_2, x_3);
-        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
-        x_1 = _mm_xor_si128(x_1, x_2);
-        x_2 = _mm_shuffle_epi32(x_2, 0x93);
-
-        t_1 = x_1;
-        x_1 = _mm_slli_epi32(x_1, 7);
-        t_1 = _mm_srli_epi32(t_1, 25);
-        x_1 = _mm_xor_si128(x_1, t_1);
-    }
-    x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
-    x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
-    x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
-    x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
-    x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*) (m + 0)));
-    x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*) (m + 16)));
-    x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*) (m + 32)));
-    x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*) (m + 48)));
-    _mm_storeu_si128((__m128i*) (c + 0), x_0);
-    _mm_storeu_si128((__m128i*) (c + 16), x_1);
-    _mm_storeu_si128((__m128i*) (c + 32), x_2);
-    _mm_storeu_si128((__m128i*) (c + 48), x_3);
-
-    in12 = x[12];
-    in13 = x[13];
-    in12++;
-    if (in12 == 0) {
-        in13++;
-    }
-    x[12] = in12;
-    x[13] = in13;
-
-    bytes -= 64;
-    c += 64;
-    m += 64;
+while(bytes >= 64)
+{
+  __m128i x_0, x_1, x_2, x_3;
+  __m128i t_1;
+  const __m128i rot16 =
+      _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+  const __m128i rot8 =
+      _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
+
+  uint32_t in12;
+  uint32_t in13;
+  int i;
+
+  x_0 = _mm_loadu_si128((__m128i*)(x + 0));
+  x_1 = _mm_loadu_si128((__m128i*)(x + 4));
+  x_2 = _mm_loadu_si128((__m128i*)(x + 8));
+  x_3 = _mm_loadu_si128((__m128i*)(x + 12));
+
+  for(i = 0; i < ROUNDS; i += 2)
+  {
+    x_0 = _mm_add_epi32(x_0, x_1);
+    x_3 = _mm_xor_si128(x_3, x_0);
+    x_3 = _mm_shuffle_epi8(x_3, rot16);
+
+    x_2 = _mm_add_epi32(x_2, x_3);
+    x_1 = _mm_xor_si128(x_1, x_2);
+
+    t_1 = x_1;
+    x_1 = _mm_slli_epi32(x_1, 12);
+    t_1 = _mm_srli_epi32(t_1, 20);
+    x_1 = _mm_xor_si128(x_1, t_1);
+
+    x_0 = _mm_add_epi32(x_0, x_1);
+    x_3 = _mm_xor_si128(x_3, x_0);
+    x_0 = _mm_shuffle_epi32(x_0, 0x93);
+    x_3 = _mm_shuffle_epi8(x_3, rot8);
+
+    x_2 = _mm_add_epi32(x_2, x_3);
+    x_3 = _mm_shuffle_epi32(x_3, 0x4e);
+    x_1 = _mm_xor_si128(x_1, x_2);
+    x_2 = _mm_shuffle_epi32(x_2, 0x39);
+
+    t_1 = x_1;
+    x_1 = _mm_slli_epi32(x_1, 7);
+    t_1 = _mm_srli_epi32(t_1, 25);
+    x_1 = _mm_xor_si128(x_1, t_1);
+
+    x_0 = _mm_add_epi32(x_0, x_1);
+    x_3 = _mm_xor_si128(x_3, x_0);
+    x_3 = _mm_shuffle_epi8(x_3, rot16);
+
+    x_2 = _mm_add_epi32(x_2, x_3);
+    x_1 = _mm_xor_si128(x_1, x_2);
+
+    t_1 = x_1;
+    x_1 = _mm_slli_epi32(x_1, 12);
+    t_1 = _mm_srli_epi32(t_1, 20);
+    x_1 = _mm_xor_si128(x_1, t_1);
+
+    x_0 = _mm_add_epi32(x_0, x_1);
+    x_3 = _mm_xor_si128(x_3, x_0);
+    x_0 = _mm_shuffle_epi32(x_0, 0x39);
+    x_3 = _mm_shuffle_epi8(x_3, rot8);
+
+    x_2 = _mm_add_epi32(x_2, x_3);
+    x_3 = _mm_shuffle_epi32(x_3, 0x4e);
+    x_1 = _mm_xor_si128(x_1, x_2);
+    x_2 = _mm_shuffle_epi32(x_2, 0x93);
+
+    t_1 = x_1;
+    x_1 = _mm_slli_epi32(x_1, 7);
+    t_1 = _mm_srli_epi32(t_1, 25);
+    x_1 = _mm_xor_si128(x_1, t_1);
+  }
+  x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
+  x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
+  x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
+  x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
+  x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*)(m + 0)));
+  x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*)(m + 16)));
+  x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*)(m + 32)));
+  x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*)(m + 48)));
+  _mm_storeu_si128((__m128i*)(c + 0), x_0);
+  _mm_storeu_si128((__m128i*)(c + 16), x_1);
+  _mm_storeu_si128((__m128i*)(c + 32), x_2);
+  _mm_storeu_si128((__m128i*)(c + 48), x_3);
+
+  in12 = x[12];
+  in13 = x[13];
+  in12++;
+  if(in12 == 0)
+  {
+    in13++;
+  }
+  x[12] = in12;
+  x[13] = in13;
+
+  bytes -= 64;
+  c += 64;
+  m += 64;
 }
--- a/crypto/chacha20/dolbeau/u4.h
+++ b/crypto/chacha20/dolbeau/u4.h
@ -1,174 +1,177 @@

 #define VEC4_ROT(A, IMM) \
-    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
+  _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))

 /* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
 * 16) (better) */
 #define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \
-    x_##A = _mm_add_epi32(x_##A, x_##B);      \
-    t_##A = _mm_xor_si128(x_##D, x_##A);      \
-    x_##D = _mm_shuffle_epi8(t_##A, rot16);   \
-    x_##C = _mm_add_epi32(x_##C, x_##D);      \
-    t_##C = _mm_xor_si128(x_##B, x_##C);      \
-    x_##B = VEC4_ROT(t_##C, 12);              \
-    x_##A = _mm_add_epi32(x_##A, x_##B);      \
-    t_##A = _mm_xor_si128(x_##D, x_##A);      \
-    x_##D = _mm_shuffle_epi8(t_##A, rot8);    \
-    x_##C = _mm_add_epi32(x_##C, x_##D);      \
-    t_##C = _mm_xor_si128(x_##B, x_##C);      \
-    x_##B = VEC4_ROT(t_##C, 7)
+  x_##A = _mm_add_epi32(x_##A, x_##B);        \
+  t_##A = _mm_xor_si128(x_##D, x_##A);        \
+  x_##D = _mm_shuffle_epi8(t_##A, rot16);     \
+  x_##C = _mm_add_epi32(x_##C, x_##D);        \
+  t_##C = _mm_xor_si128(x_##B, x_##C);        \
+  x_##B = VEC4_ROT(t_##C, 12);                \
+  x_##A = _mm_add_epi32(x_##A, x_##B);        \
+  t_##A = _mm_xor_si128(x_##D, x_##A);        \
+  x_##D = _mm_shuffle_epi8(t_##A, rot8);      \
+  x_##C = _mm_add_epi32(x_##C, x_##D);        \
+  t_##C = _mm_xor_si128(x_##B, x_##C);        \
+  x_##B = VEC4_ROT(t_##C, 7)

 #define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)

-if (bytes >= 256) {
-    /* constant for shuffling bytes (replacing multiple-of-8 rotates) */
-    __m128i rot16 =
-        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
-    __m128i rot8 =
-        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
-
-    __m128i x_0  = _mm_set1_epi32(x[0]);
-    __m128i x_1  = _mm_set1_epi32(x[1]);
-    __m128i x_2  = _mm_set1_epi32(x[2]);
-    __m128i x_3  = _mm_set1_epi32(x[3]);
-    __m128i x_4  = _mm_set1_epi32(x[4]);
-    __m128i x_5  = _mm_set1_epi32(x[5]);
-    __m128i x_6  = _mm_set1_epi32(x[6]);
-    __m128i x_7  = _mm_set1_epi32(x[7]);
-    __m128i x_8  = _mm_set1_epi32(x[8]);
-    __m128i x_9  = _mm_set1_epi32(x[9]);
-    __m128i x_10 = _mm_set1_epi32(x[10]);
-    __m128i x_11 = _mm_set1_epi32(x[11]);
-    __m128i x_12;
-    __m128i x_13;
-    __m128i x_14   = _mm_set1_epi32(x[14]);
-    __m128i x_15   = _mm_set1_epi32(x[15]);
-    __m128i orig0  = x_0;
-    __m128i orig1  = x_1;
-    __m128i orig2  = x_2;
-    __m128i orig3  = x_3;
-    __m128i orig4  = x_4;
-    __m128i orig5  = x_5;
-    __m128i orig6  = x_6;
-    __m128i orig7  = x_7;
-    __m128i orig8  = x_8;
-    __m128i orig9  = x_9;
-    __m128i orig10 = x_10;
-    __m128i orig11 = x_11;
-    __m128i orig12;
-    __m128i orig13;
-    __m128i orig14 = x_14;
-    __m128i orig15 = x_15;
-    __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
-        t_13, t_14, t_15;
-
-    uint32_t in12, in13;
-    int      i;
-
-    while (bytes >= 256) {
-        const __m128i addv12 = _mm_set_epi64x(1, 0);
-        const __m128i addv13 = _mm_set_epi64x(3, 2);
-        __m128i       t12, t13;
-        uint64_t      in1213;
-
-        x_0  = orig0;
-        x_1  = orig1;
-        x_2  = orig2;
-        x_3  = orig3;
-        x_4  = orig4;
-        x_5  = orig5;
-        x_6  = orig6;
-        x_7  = orig7;
-        x_8  = orig8;
-        x_9  = orig9;
-        x_10 = orig10;
-        x_11 = orig11;
-        x_14 = orig14;
-        x_15 = orig15;
-
-        in12   = x[12];
-        in13   = x[13];
-        in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
-        t12    = _mm_set1_epi64x(in1213);
-        t13    = _mm_set1_epi64x(in1213);
-
-        x_12 = _mm_add_epi64(addv12, t12);
-        x_13 = _mm_add_epi64(addv13, t13);
-
-        t12 = _mm_unpacklo_epi32(x_12, x_13);
-        t13 = _mm_unpackhi_epi32(x_12, x_13);
-
-        x_12 = _mm_unpacklo_epi32(t12, t13);
-        x_13 = _mm_unpackhi_epi32(t12, t13);
-
-        orig12 = x_12;
-        orig13 = x_13;
-
-        in1213 += 4;
-
-        x[12] = in1213 & 0xFFFFFFFF;
-        x[13] = (in1213 >> 32) & 0xFFFFFFFF;
-
-        for (i = 0; i < ROUNDS; i += 2) {
-            VEC4_QUARTERROUND(0, 4, 8, 12);
-            VEC4_QUARTERROUND(1, 5, 9, 13);
-            VEC4_QUARTERROUND(2, 6, 10, 14);
-            VEC4_QUARTERROUND(3, 7, 11, 15);
-            VEC4_QUARTERROUND(0, 5, 10, 15);
-            VEC4_QUARTERROUND(1, 6, 11, 12);
-            VEC4_QUARTERROUND(2, 7, 8, 13);
-            VEC4_QUARTERROUND(3, 4, 9, 14);
-        }
-
-#define ONEQUAD_TRANSPOSE(A, B, C, D)                                     \
-    {                                                                     \
-        __m128i t0, t1, t2, t3;                                           \
-                                                                          \
-        x_##A = _mm_add_epi32(x_##A, orig##A);                            \
-        x_##B = _mm_add_epi32(x_##B, orig##B);                            \
-        x_##C = _mm_add_epi32(x_##C, orig##C);                            \
-        x_##D = _mm_add_epi32(x_##D, orig##D);                            \
-        t_##A = _mm_unpacklo_epi32(x_##A, x_##B);                         \
-        t_##B = _mm_unpacklo_epi32(x_##C, x_##D);                         \
-        t_##C = _mm_unpackhi_epi32(x_##A, x_##B);                         \
-        t_##D = _mm_unpackhi_epi32(x_##C, x_##D);                         \
-        x_##A = _mm_unpacklo_epi64(t_##A, t_##B);                         \
-        x_##B = _mm_unpackhi_epi64(t_##A, t_##B);                         \
-        x_##C = _mm_unpacklo_epi64(t_##C, t_##D);                         \
-        x_##D = _mm_unpackhi_epi64(t_##C, t_##D);                         \
-                                                                          \
-        t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*) (m + 0)));   \
-        _mm_storeu_si128((__m128i*) (c + 0), t0);                         \
-        t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*) (m + 64)));  \
-        _mm_storeu_si128((__m128i*) (c + 64), t1);                        \
-        t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*) (m + 128))); \
-        _mm_storeu_si128((__m128i*) (c + 128), t2);                       \
-        t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*) (m + 192))); \
-        _mm_storeu_si128((__m128i*) (c + 192), t3);                       \
+if(bytes >= 256)
+{
+  /* constant for shuffling bytes (replacing multiple-of-8 rotates) */
+  __m128i rot16 =
+      _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+  __m128i rot8 =
+      _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
+
+  __m128i x_0  = _mm_set1_epi32(x[0]);
+  __m128i x_1  = _mm_set1_epi32(x[1]);
+  __m128i x_2  = _mm_set1_epi32(x[2]);
+  __m128i x_3  = _mm_set1_epi32(x[3]);
+  __m128i x_4  = _mm_set1_epi32(x[4]);
+  __m128i x_5  = _mm_set1_epi32(x[5]);
+  __m128i x_6  = _mm_set1_epi32(x[6]);
+  __m128i x_7  = _mm_set1_epi32(x[7]);
+  __m128i x_8  = _mm_set1_epi32(x[8]);
+  __m128i x_9  = _mm_set1_epi32(x[9]);
+  __m128i x_10 = _mm_set1_epi32(x[10]);
+  __m128i x_11 = _mm_set1_epi32(x[11]);
+  __m128i x_12;
+  __m128i x_13;
+  __m128i x_14   = _mm_set1_epi32(x[14]);
+  __m128i x_15   = _mm_set1_epi32(x[15]);
+  __m128i orig0  = x_0;
+  __m128i orig1  = x_1;
+  __m128i orig2  = x_2;
+  __m128i orig3  = x_3;
+  __m128i orig4  = x_4;
+  __m128i orig5  = x_5;
+  __m128i orig6  = x_6;
+  __m128i orig7  = x_7;
+  __m128i orig8  = x_8;
+  __m128i orig9  = x_9;
+  __m128i orig10 = x_10;
+  __m128i orig11 = x_11;
+  __m128i orig12;
+  __m128i orig13;
+  __m128i orig14 = x_14;
+  __m128i orig15 = x_15;
+  __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
+      t_13, t_14, t_15;
+
+  uint32_t in12, in13;
+  int i;
+
+  while(bytes >= 256)
+  {
+    const __m128i addv12 = _mm_set_epi64x(1, 0);
+    const __m128i addv13 = _mm_set_epi64x(3, 2);
+    __m128i t12, t13;
+    uint64_t in1213;
+
+    x_0  = orig0;
+    x_1  = orig1;
+    x_2  = orig2;
+    x_3  = orig3;
+    x_4  = orig4;
+    x_5  = orig5;
+    x_6  = orig6;
+    x_7  = orig7;
+    x_8  = orig8;
+    x_9  = orig9;
+    x_10 = orig10;
+    x_11 = orig11;
+    x_14 = orig14;
+    x_15 = orig15;
+
+    in12   = x[12];
+    in13   = x[13];
+    in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
+    t12    = _mm_set1_epi64x(in1213);
+    t13    = _mm_set1_epi64x(in1213);
+
+    x_12 = _mm_add_epi64(addv12, t12);
+    x_13 = _mm_add_epi64(addv13, t13);
+
+    t12 = _mm_unpacklo_epi32(x_12, x_13);
+    t13 = _mm_unpackhi_epi32(x_12, x_13);
+
+    x_12 = _mm_unpacklo_epi32(t12, t13);
+    x_13 = _mm_unpackhi_epi32(t12, t13);
+
+    orig12 = x_12;
+    orig13 = x_13;
+
+    in1213 += 4;
+
+    x[12] = in1213 & 0xFFFFFFFF;
+    x[13] = (in1213 >> 32) & 0xFFFFFFFF;
+
+    for(i = 0; i < ROUNDS; i += 2)
+    {
+      VEC4_QUARTERROUND(0, 4, 8, 12);
+      VEC4_QUARTERROUND(1, 5, 9, 13);
+      VEC4_QUARTERROUND(2, 6, 10, 14);
+      VEC4_QUARTERROUND(3, 7, 11, 15);
+      VEC4_QUARTERROUND(0, 5, 10, 15);
+      VEC4_QUARTERROUND(1, 6, 11, 12);
+      VEC4_QUARTERROUND(2, 7, 8, 13);
+      VEC4_QUARTERROUND(3, 4, 9, 14);
    }

+#define ONEQUAD_TRANSPOSE(A, B, C, D)                                \
+  {                                                                  \
+    __m128i t0, t1, t2, t3;                                          \
+                                                                     \
+    x_##A = _mm_add_epi32(x_##A, orig##A);                           \
+    x_##B = _mm_add_epi32(x_##B, orig##B);                           \
+    x_##C = _mm_add_epi32(x_##C, orig##C);                           \
+    x_##D = _mm_add_epi32(x_##D, orig##D);                           \
+    t_##A = _mm_unpacklo_epi32(x_##A, x_##B);                        \
+    t_##B = _mm_unpacklo_epi32(x_##C, x_##D);                        \
+    t_##C = _mm_unpackhi_epi32(x_##A, x_##B);                        \
+    t_##D = _mm_unpackhi_epi32(x_##C, x_##D);                        \
+    x_##A = _mm_unpacklo_epi64(t_##A, t_##B);                        \
+    x_##B = _mm_unpackhi_epi64(t_##A, t_##B);                        \
+    x_##C = _mm_unpacklo_epi64(t_##C, t_##D);                        \
+    x_##D = _mm_unpackhi_epi64(t_##C, t_##D);                        \
+                                                                     \
+    t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*)(m + 0)));   \
+    _mm_storeu_si128((__m128i*)(c + 0), t0);                         \
+    t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*)(m + 64)));  \
+    _mm_storeu_si128((__m128i*)(c + 64), t1);                        \
+    t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*)(m + 128))); \
+    _mm_storeu_si128((__m128i*)(c + 128), t2);                       \
+    t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*)(m + 192))); \
+    _mm_storeu_si128((__m128i*)(c + 192), t3);                       \
+  }
+
 #define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

-        ONEQUAD(0, 1, 2, 3);
-        m += 16;
-        c += 16;
-        ONEQUAD(4, 5, 6, 7);
-        m += 16;
-        c += 16;
-        ONEQUAD(8, 9, 10, 11);
-        m += 16;
-        c += 16;
-        ONEQUAD(12, 13, 14, 15);
-        m -= 48;
-        c -= 48;
+    ONEQUAD(0, 1, 2, 3);
+    m += 16;
+    c += 16;
+    ONEQUAD(4, 5, 6, 7);
+    m += 16;
+    c += 16;
+    ONEQUAD(8, 9, 10, 11);
+    m += 16;
+    c += 16;
+    ONEQUAD(12, 13, 14, 15);
+    m -= 48;
+    c -= 48;

 #undef ONEQUAD
 #undef ONEQUAD_TRANSPOSE

-        bytes -= 256;
-        c += 256;
-        m += 256;
-    }
+    bytes -= 256;
+    c += 256;
+    m += 256;
+  }
 }
 #undef VEC4_ROT
 #undef VEC4_QUARTERROUND
--- a/crypto/chacha20/dolbeau/u8.h
+++ b/crypto/chacha20/dolbeau/u8.h
@ -1,346 +1,344 @@

 #define VEC8_ROT(A, IMM) \
-    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))
+  _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))

 /* implements a vector quarter round by-the-book (naive!) */
 #define VEC8_QUARTERROUND_NAIVE(A, B, C, D) \
-    x_##A = _mm256_add_epi32(x_##A, x_##B); \
-    t_##A = _mm256_xor_si256(x_##D, x_##A); \
-    x_##D = VEC8_ROT(t_##A, 16);            \
-    x_##C = _mm256_add_epi32(x_##C, x_##D); \
-    t_##C = _mm256_xor_si256(x_##B, x_##C); \
-    x_##B = VEC8_ROT(t_##C, 12);            \
-    x_##A = _mm256_add_epi32(x_##A, x_##B); \
-    t_##A = _mm256_xor_si256(x_##D, x_##A); \
-    x_##D = VEC8_ROT(t_##A, 8);             \
-    x_##C = _mm256_add_epi32(x_##C, x_##D); \
-    t_##C = _mm256_xor_si256(x_##B, x_##C); \
-    x_##B = VEC8_ROT(t_##C, 7)
+  x_##A = _mm256_add_epi32(x_##A, x_##B);   \
+  t_##A = _mm256_xor_si256(x_##D, x_##A);   \
+  x_##D = VEC8_ROT(t_##A, 16);              \
+  x_##C = _mm256_add_epi32(x_##C, x_##D);   \
+  t_##C = _mm256_xor_si256(x_##B, x_##C);   \
+  x_##B = VEC8_ROT(t_##C, 12);              \
+  x_##A = _mm256_add_epi32(x_##A, x_##B);   \
+  t_##A = _mm256_xor_si256(x_##D, x_##A);   \
+  x_##D = VEC8_ROT(t_##A, 8);               \
+  x_##C = _mm256_add_epi32(x_##C, x_##D);   \
+  t_##C = _mm256_xor_si256(x_##B, x_##C);   \
+  x_##B = VEC8_ROT(t_##C, 7)

 /* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
 * 16) (better) */
-#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D)  \
-    x_##A = _mm256_add_epi32(x_##A, x_##B);    \
-    t_##A = _mm256_xor_si256(x_##D, x_##A);    \
-    x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
-    x_##C = _mm256_add_epi32(x_##C, x_##D);    \
-    t_##C = _mm256_xor_si256(x_##B, x_##C);    \
-    x_##B = VEC8_ROT(t_##C, 12);               \
-    x_##A = _mm256_add_epi32(x_##A, x_##B);    \
-    t_##A = _mm256_xor_si256(x_##D, x_##A);    \
-    x_##D = _mm256_shuffle_epi8(t_##A, rot8);  \
-    x_##C = _mm256_add_epi32(x_##C, x_##D);    \
-    t_##C = _mm256_xor_si256(x_##B, x_##C);    \
-    x_##B = VEC8_ROT(t_##C, 7)
+#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \
+  x_##A = _mm256_add_epi32(x_##A, x_##B);     \
+  t_##A = _mm256_xor_si256(x_##D, x_##A);     \
+  x_##D = _mm256_shuffle_epi8(t_##A, rot16);  \
+  x_##C = _mm256_add_epi32(x_##C, x_##D);     \
+  t_##C = _mm256_xor_si256(x_##B, x_##C);     \
+  x_##B = VEC8_ROT(t_##C, 12);                \
+  x_##A = _mm256_add_epi32(x_##A, x_##B);     \
+  t_##A = _mm256_xor_si256(x_##D, x_##A);     \
+  x_##D = _mm256_shuffle_epi8(t_##A, rot8);   \
+  x_##C = _mm256_add_epi32(x_##C, x_##D);     \
+  t_##C = _mm256_xor_si256(x_##B, x_##C);     \
+  x_##B = VEC8_ROT(t_##C, 7)

 /* same, but replace 2 of the shift/shift/or "rotation" by byte & word shuffles
 * (8 & 16) (not as good as previous) */
-#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D)                                 \
-    x_##A = _mm256_add_epi32(x_##A, x_##B);                                    \
-    t_##A = _mm256_xor_si256(x_##D, x_##A);                                    \
-    x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
-    x_##C = _mm256_add_epi32(x_##C, x_##D);                                    \
-    t_##C = _mm256_xor_si256(x_##B, x_##C);                                    \
-    x_##B = VEC8_ROT(t_##C, 12);                                               \
-    x_##A = _mm256_add_epi32(x_##A, x_##B);                                    \
-    t_##A = _mm256_xor_si256(x_##D, x_##A);                                    \
-    x_##D = _mm256_shuffle_epi8(t_##A, rot8);                                  \
-    x_##C = _mm256_add_epi32(x_##C, x_##D);                                    \
-    t_##C = _mm256_xor_si256(x_##B, x_##C);                                    \
-    x_##B = VEC8_ROT(t_##C, 7)
+#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D)                               \
+  x_##A = _mm256_add_epi32(x_##A, x_##B);                                    \
+  t_##A = _mm256_xor_si256(x_##D, x_##A);                                    \
+  x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
+  x_##C = _mm256_add_epi32(x_##C, x_##D);                                    \
+  t_##C = _mm256_xor_si256(x_##B, x_##C);                                    \
+  x_##B = VEC8_ROT(t_##C, 12);                                               \
+  x_##A = _mm256_add_epi32(x_##A, x_##B);                                    \
+  t_##A = _mm256_xor_si256(x_##D, x_##A);                                    \
+  x_##D = _mm256_shuffle_epi8(t_##A, rot8);                                  \
+  x_##C = _mm256_add_epi32(x_##C, x_##D);                                    \
+  t_##C = _mm256_xor_si256(x_##B, x_##C);                                    \
+  x_##B = VEC8_ROT(t_##C, 7)

 #define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D)

-#define VEC8_LINE1(A, B, C, D)              \
-    x_##A = _mm256_add_epi32(x_##A, x_##B); \
-    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
-#define VEC8_LINE2(A, B, C, D)              \
-    x_##C = _mm256_add_epi32(x_##C, x_##D); \
-    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
-#define VEC8_LINE3(A, B, C, D)              \
-    x_##A = _mm256_add_epi32(x_##A, x_##B); \
-    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
-#define VEC8_LINE4(A, B, C, D)              \
-    x_##C = _mm256_add_epi32(x_##C, x_##D); \
-    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)
+#define VEC8_LINE1(A, B, C, D)            \
+  x_##A = _mm256_add_epi32(x_##A, x_##B); \
+  x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
+#define VEC8_LINE2(A, B, C, D)            \
+  x_##C = _mm256_add_epi32(x_##C, x_##D); \
+  x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
+#define VEC8_LINE3(A, B, C, D)            \
+  x_##A = _mm256_add_epi32(x_##A, x_##B); \
+  x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
+#define VEC8_LINE4(A, B, C, D)            \
+  x_##C = _mm256_add_epi32(x_##C, x_##D); \
+  x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)

 #define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \
                       C4, D4)                                                 \
-    VEC8_LINE1(A1, B1, C1, D1);                                                \
-    VEC8_LINE1(A2, B2, C2, D2);                                                \
-    VEC8_LINE1(A3, B3, C3, D3);                                                \
-    VEC8_LINE1(A4, B4, C4, D4);                                                \
-    VEC8_LINE2(A1, B1, C1, D1);                                                \
-    VEC8_LINE2(A2, B2, C2, D2);                                                \
-    VEC8_LINE2(A3, B3, C3, D3);                                                \
-    VEC8_LINE2(A4, B4, C4, D4);                                                \
-    VEC8_LINE3(A1, B1, C1, D1);                                                \
-    VEC8_LINE3(A2, B2, C2, D2);                                                \
-    VEC8_LINE3(A3, B3, C3, D3);                                                \
-    VEC8_LINE3(A4, B4, C4, D4);                                                \
-    VEC8_LINE4(A1, B1, C1, D1);                                                \
-    VEC8_LINE4(A2, B2, C2, D2);                                                \
-    VEC8_LINE4(A3, B3, C3, D3);                                                \
-    VEC8_LINE4(A4, B4, C4, D4)
+  VEC8_LINE1(A1, B1, C1, D1);                                                  \
+  VEC8_LINE1(A2, B2, C2, D2);                                                  \
+  VEC8_LINE1(A3, B3, C3, D3);                                                  \
+  VEC8_LINE1(A4, B4, C4, D4);                                                  \
+  VEC8_LINE2(A1, B1, C1, D1);                                                  \
+  VEC8_LINE2(A2, B2, C2, D2);                                                  \
+  VEC8_LINE2(A3, B3, C3, D3);                                                  \
+  VEC8_LINE2(A4, B4, C4, D4);                                                  \
+  VEC8_LINE3(A1, B1, C1, D1);                                                  \
+  VEC8_LINE3(A2, B2, C2, D2);                                                  \
+  VEC8_LINE3(A3, B3, C3, D3);                                                  \
+  VEC8_LINE3(A4, B4, C4, D4);                                                  \
+  VEC8_LINE4(A1, B1, C1, D1);                                                  \
+  VEC8_LINE4(A2, B2, C2, D2);                                                  \
+  VEC8_LINE4(A3, B3, C3, D3);                                                  \
+  VEC8_LINE4(A4, B4, C4, D4)

 #define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \
                        B4, C4, D4)                                         \
-    VEC8_LINE1(A1, B1, C1, D1);                                             \
-    VEC8_LINE1(A2, B2, C2, D2);                                             \
-    VEC8_LINE2(A1, B1, C1, D1);                                             \
-    VEC8_LINE2(A2, B2, C2, D2);                                             \
-    VEC8_LINE3(A1, B1, C1, D1);                                             \
-    VEC8_LINE3(A2, B2, C2, D2);                                             \
-    VEC8_LINE4(A1, B1, C1, D1);                                             \
-    VEC8_LINE4(A2, B2, C2, D2);                                             \
-    VEC8_LINE1(A3, B3, C3, D3);                                             \
-    VEC8_LINE1(A4, B4, C4, D4);                                             \
-    VEC8_LINE2(A3, B3, C3, D3);                                             \
-    VEC8_LINE2(A4, B4, C4, D4);                                             \
-    VEC8_LINE3(A3, B3, C3, D3);                                             \
-    VEC8_LINE3(A4, B4, C4, D4);                                             \
-    VEC8_LINE4(A3, B3, C3, D3);                                             \
-    VEC8_LINE4(A4, B4, C4, D4)
+  VEC8_LINE1(A1, B1, C1, D1);                                               \
+  VEC8_LINE1(A2, B2, C2, D2);                                               \
+  VEC8_LINE2(A1, B1, C1, D1);                                               \
+  VEC8_LINE2(A2, B2, C2, D2);                                               \
+  VEC8_LINE3(A1, B1, C1, D1);                                               \
+  VEC8_LINE3(A2, B2, C2, D2);                                               \
+  VEC8_LINE4(A1, B1, C1, D1);                                               \
+  VEC8_LINE4(A2, B2, C2, D2);                                               \
+  VEC8_LINE1(A3, B3, C3, D3);                                               \
+  VEC8_LINE1(A4, B4, C4, D4);                                               \
+  VEC8_LINE2(A3, B3, C3, D3);                                               \
+  VEC8_LINE2(A4, B4, C4, D4);                                               \
+  VEC8_LINE3(A3, B3, C3, D3);                                               \
+  VEC8_LINE3(A4, B4, C4, D4);                                               \
+  VEC8_LINE4(A3, B3, C3, D3);                                               \
+  VEC8_LINE4(A4, B4, C4, D4)

 #define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \
                               A4, B4, C4, D4)                                 \
-    VEC8_LINE1(A1, B1, C1, D1);                                                \
-    VEC8_LINE1(A2, B2, C2, D2);                                                \
-    VEC8_LINE2(A1, B1, C1, D1);                                                \
-    VEC8_LINE2(A2, B2, C2, D2);                                                \
-    VEC8_LINE1(A3, B3, C3, D3);                                                \
-    VEC8_LINE1(A4, B4, C4, D4);                                                \
-    VEC8_LINE2(A3, B3, C3, D3);                                                \
-    VEC8_LINE2(A4, B4, C4, D4);                                                \
-    VEC8_LINE3(A1, B1, C1, D1);                                                \
-    VEC8_LINE3(A2, B2, C2, D2);                                                \
-    VEC8_LINE4(A1, B1, C1, D1);                                                \
-    VEC8_LINE4(A2, B2, C2, D2);                                                \
-    VEC8_LINE3(A3, B3, C3, D3);                                                \
-    VEC8_LINE3(A4, B4, C4, D4);                                                \
-    VEC8_LINE4(A3, B3, C3, D3);                                                \
-    VEC8_LINE4(A4, B4, C4, D4)
+  VEC8_LINE1(A1, B1, C1, D1);                                                  \
+  VEC8_LINE1(A2, B2, C2, D2);                                                  \
+  VEC8_LINE2(A1, B1, C1, D1);                                                  \
+  VEC8_LINE2(A2, B2, C2, D2);                                                  \
+  VEC8_LINE1(A3, B3, C3, D3);                                                  \
+  VEC8_LINE1(A4, B4, C4, D4);                                                  \
+  VEC8_LINE2(A3, B3, C3, D3);                                                  \
+  VEC8_LINE2(A4, B4, C4, D4);                                                  \
+  VEC8_LINE3(A1, B1, C1, D1);                                                  \
+  VEC8_LINE3(A2, B2, C2, D2);                                                  \
+  VEC8_LINE4(A1, B1, C1, D1);                                                  \
+  VEC8_LINE4(A2, B2, C2, D2);                                                  \
+  VEC8_LINE3(A3, B3, C3, D3);                                                  \
+  VEC8_LINE3(A4, B4, C4, D4);                                                  \
+  VEC8_LINE4(A3, B3, C3, D3);                                                  \
+  VEC8_LINE4(A4, B4, C4, D4)

 #define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
                   D4)                                                         \
-    VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
-                   D4)
-
-if (bytes >= 512) {
-    /* constant for shuffling bytes (replacing multiple-of-8 rotates) */
-    __m256i rot16 =
-        _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
-                        13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
-    __m256i rot8 =
-        _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
-                        14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
-    uint32_t in12, in13;
-
-    /* the naive way seems as fast (if not a bit faster) than the vector way */
-    __m256i x_0  = _mm256_set1_epi32(x[0]);
-    __m256i x_1  = _mm256_set1_epi32(x[1]);
-    __m256i x_2  = _mm256_set1_epi32(x[2]);
-    __m256i x_3  = _mm256_set1_epi32(x[3]);
-    __m256i x_4  = _mm256_set1_epi32(x[4]);
-    __m256i x_5  = _mm256_set1_epi32(x[5]);
-    __m256i x_6  = _mm256_set1_epi32(x[6]);
-    __m256i x_7  = _mm256_set1_epi32(x[7]);
-    __m256i x_8  = _mm256_set1_epi32(x[8]);
-    __m256i x_9  = _mm256_set1_epi32(x[9]);
-    __m256i x_10 = _mm256_set1_epi32(x[10]);
-    __m256i x_11 = _mm256_set1_epi32(x[11]);
-    __m256i x_12;
-    __m256i x_13;
-    __m256i x_14 = _mm256_set1_epi32(x[14]);
-    __m256i x_15 = _mm256_set1_epi32(x[15]);
-
-    __m256i orig0  = x_0;
-    __m256i orig1  = x_1;
-    __m256i orig2  = x_2;
-    __m256i orig3  = x_3;
-    __m256i orig4  = x_4;
-    __m256i orig5  = x_5;
-    __m256i orig6  = x_6;
-    __m256i orig7  = x_7;
-    __m256i orig8  = x_8;
-    __m256i orig9  = x_9;
-    __m256i orig10 = x_10;
-    __m256i orig11 = x_11;
-    __m256i orig12;
-    __m256i orig13;
-    __m256i orig14 = x_14;
-    __m256i orig15 = x_15;
-    __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
-        t_13, t_14, t_15;
-
-    while (bytes >= 512) {
-        const __m256i addv12  = _mm256_set_epi64x(3, 2, 1, 0);
-        const __m256i addv13  = _mm256_set_epi64x(7, 6, 5, 4);
-        const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
-        __m256i       t12, t13;
-
-        uint64_t in1213;
-        int      i;
-
-        x_0  = orig0;
-        x_1  = orig1;
-        x_2  = orig2;
-        x_3  = orig3;
-        x_4  = orig4;
-        x_5  = orig5;
-        x_6  = orig6;
-        x_7  = orig7;
-        x_8  = orig8;
-        x_9  = orig9;
-        x_10 = orig10;
-        x_11 = orig11;
-        x_14 = orig14;
-        x_15 = orig15;
-
-        in12   = x[12];
-        in13   = x[13];
-        in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
-        x_12   = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
-
-        t12 = _mm256_add_epi64(addv12, x_12);
-        t13 = _mm256_add_epi64(addv13, x_13);
-
-        x_12 = _mm256_unpacklo_epi32(t12, t13);
-        x_13 = _mm256_unpackhi_epi32(t12, t13);
-
-        t12 = _mm256_unpacklo_epi32(x_12, x_13);
-        t13 = _mm256_unpackhi_epi32(x_12, x_13);
-
-        /* required because unpack* are intra-lane */
-        x_12 = _mm256_permutevar8x32_epi32(t12, permute);
-        x_13 = _mm256_permutevar8x32_epi32(t13, permute);
-
-        orig12 = x_12;
-        orig13 = x_13;
-
-        in1213 += 8;
-
-        x[12] = in1213 & 0xFFFFFFFF;
-        x[13] = (in1213 >> 32) & 0xFFFFFFFF;
-
-        for (i = 0; i < ROUNDS; i += 2) {
-            VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-            VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
-        }
-
-#define ONEQUAD_TRANSPOSE(A, B, C, D)                              \
-    {                                                              \
-        __m128i t0, t1, t2, t3;                                    \
-        x_##A = _mm256_add_epi32(x_##A, orig##A);                  \
-        x_##B = _mm256_add_epi32(x_##B, orig##B);                  \
-        x_##C = _mm256_add_epi32(x_##C, orig##C);                  \
-        x_##D = _mm256_add_epi32(x_##D, orig##D);                  \
-        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B);               \
-        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D);               \
-        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B);               \
-        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D);               \
-        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B);               \
-        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B);               \
-        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D);               \
-        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D);               \
-        t0    = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0),  \
-                           _mm_loadu_si128((__m128i*) (m + 0)));   \
-        _mm_storeu_si128((__m128i*) (c + 0), t0);                  \
-        t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0),     \
-                           _mm_loadu_si128((__m128i*) (m + 64)));  \
-        _mm_storeu_si128((__m128i*) (c + 64), t1);                 \
-        t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0),     \
-                           _mm_loadu_si128((__m128i*) (m + 128))); \
-        _mm_storeu_si128((__m128i*) (c + 128), t2);                \
-        t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0),     \
-                           _mm_loadu_si128((__m128i*) (m + 192))); \
-        _mm_storeu_si128((__m128i*) (c + 192), t3);                \
-        t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1),     \
-                           _mm_loadu_si128((__m128i*) (m + 256))); \
-        _mm_storeu_si128((__m128i*) (c + 256), t0);                \
-        t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1),     \
-                           _mm_loadu_si128((__m128i*) (m + 320))); \
-        _mm_storeu_si128((__m128i*) (c + 320), t1);                \
-        t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1),     \
-                           _mm_loadu_si128((__m128i*) (m + 384))); \
-        _mm_storeu_si128((__m128i*) (c + 384), t2);                \
-        t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1),     \
-                           _mm_loadu_si128((__m128i*) (m + 448))); \
-        _mm_storeu_si128((__m128i*) (c + 448), t3);                \
+  VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, D4)
+
+if(bytes >= 512)
+{
+  /* constant for shuffling bytes (replacing multiple-of-8 rotates) */
+  __m256i rot16 =
+      _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13,
+                      12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+  __m256i rot8 =
+      _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14,
+                      13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
+  uint32_t in12, in13;
+
+  /* the naive way seems as fast (if not a bit faster) than the vector way */
+  __m256i x_0  = _mm256_set1_epi32(x[0]);
+  __m256i x_1  = _mm256_set1_epi32(x[1]);
+  __m256i x_2  = _mm256_set1_epi32(x[2]);
+  __m256i x_3  = _mm256_set1_epi32(x[3]);
+  __m256i x_4  = _mm256_set1_epi32(x[4]);
+  __m256i x_5  = _mm256_set1_epi32(x[5]);
+  __m256i x_6  = _mm256_set1_epi32(x[6]);
+  __m256i x_7  = _mm256_set1_epi32(x[7]);
+  __m256i x_8  = _mm256_set1_epi32(x[8]);
+  __m256i x_9  = _mm256_set1_epi32(x[9]);
+  __m256i x_10 = _mm256_set1_epi32(x[10]);
+  __m256i x_11 = _mm256_set1_epi32(x[11]);
+  __m256i x_12;
+  __m256i x_13;
+  __m256i x_14 = _mm256_set1_epi32(x[14]);
+  __m256i x_15 = _mm256_set1_epi32(x[15]);
+
+  __m256i orig0  = x_0;
+  __m256i orig1  = x_1;
+  __m256i orig2  = x_2;
+  __m256i orig3  = x_3;
+  __m256i orig4  = x_4;
+  __m256i orig5  = x_5;
+  __m256i orig6  = x_6;
+  __m256i orig7  = x_7;
+  __m256i orig8  = x_8;
+  __m256i orig9  = x_9;
+  __m256i orig10 = x_10;
+  __m256i orig11 = x_11;
+  __m256i orig12;
+  __m256i orig13;
+  __m256i orig14 = x_14;
+  __m256i orig15 = x_15;
+  __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
+      t_13, t_14, t_15;
+
+  while(bytes >= 512)
+  {
+    const __m256i addv12  = _mm256_set_epi64x(3, 2, 1, 0);
+    const __m256i addv13  = _mm256_set_epi64x(7, 6, 5, 4);
+    const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+    __m256i t12, t13;
+
+    uint64_t in1213;
+    int i;
+
+    x_0  = orig0;
+    x_1  = orig1;
+    x_2  = orig2;
+    x_3  = orig3;
+    x_4  = orig4;
+    x_5  = orig5;
+    x_6  = orig6;
+    x_7  = orig7;
+    x_8  = orig8;
+    x_9  = orig9;
+    x_10 = orig10;
+    x_11 = orig11;
+    x_14 = orig14;
+    x_15 = orig15;
+
+    in12   = x[12];
+    in13   = x[13];
+    in1213 = ((uint64_t)in12) | (((uint64_t)in13) << 32);
+    x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));
+
+    t12 = _mm256_add_epi64(addv12, x_12);
+    t13 = _mm256_add_epi64(addv13, x_13);
+
+    x_12 = _mm256_unpacklo_epi32(t12, t13);
+    x_13 = _mm256_unpackhi_epi32(t12, t13);
+
+    t12 = _mm256_unpacklo_epi32(x_12, x_13);
+    t13 = _mm256_unpackhi_epi32(x_12, x_13);
+
+    /* required because unpack* are intra-lane */
+    x_12 = _mm256_permutevar8x32_epi32(t12, permute);
+    x_13 = _mm256_permutevar8x32_epi32(t13, permute);
+
+    orig12 = x_12;
+    orig13 = x_13;
+
+    in1213 += 8;
+
+    x[12] = in1213 & 0xFFFFFFFF;
+    x[13] = (in1213 >> 32) & 0xFFFFFFFF;
+
+    for(i = 0; i < ROUNDS; i += 2)
+    {
+      VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+      VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
    }

+#define ONEQUAD_TRANSPOSE(A, B, C, D)                          \
+  {                                                            \
+    __m128i t0, t1, t2, t3;                                    \
+    x_##A = _mm256_add_epi32(x_##A, orig##A);                  \
+    x_##B = _mm256_add_epi32(x_##B, orig##B);                  \
+    x_##C = _mm256_add_epi32(x_##C, orig##C);                  \
+    x_##D = _mm256_add_epi32(x_##D, orig##D);                  \
+    t_##A = _mm256_unpacklo_epi32(x_##A, x_##B);               \
+    t_##B = _mm256_unpacklo_epi32(x_##C, x_##D);               \
+    t_##C = _mm256_unpackhi_epi32(x_##A, x_##B);               \
+    t_##D = _mm256_unpackhi_epi32(x_##C, x_##D);               \
+    x_##A = _mm256_unpacklo_epi64(t_##A, t_##B);               \
+    x_##B = _mm256_unpackhi_epi64(t_##A, t_##B);               \
+    x_##C = _mm256_unpacklo_epi64(t_##C, t_##D);               \
+    x_##D = _mm256_unpackhi_epi64(t_##C, t_##D);               \
+    t0    = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0),  \
+                       _mm_loadu_si128((__m128i*)(m + 0))); \
+    _mm_storeu_si128((__m128i*)(c + 0), t0);                   \
+    t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0),     \
+                       _mm_loadu_si128((__m128i*)(m + 64)));   \
+    _mm_storeu_si128((__m128i*)(c + 64), t1);                  \
+    t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0),     \
+                       _mm_loadu_si128((__m128i*)(m + 128)));  \
+    _mm_storeu_si128((__m128i*)(c + 128), t2);                 \
+    t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0),     \
+                       _mm_loadu_si128((__m128i*)(m + 192)));  \
+    _mm_storeu_si128((__m128i*)(c + 192), t3);                 \
+    t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1),     \
+                       _mm_loadu_si128((__m128i*)(m + 256)));  \
+    _mm_storeu_si128((__m128i*)(c + 256), t0);                 \
+    t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1),     \
+                       _mm_loadu_si128((__m128i*)(m + 320)));  \
+    _mm_storeu_si128((__m128i*)(c + 320), t1);                 \
+    t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1),     \
+                       _mm_loadu_si128((__m128i*)(m + 384)));  \
+    _mm_storeu_si128((__m128i*)(c + 384), t2);                 \
+    t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1),     \
+                       _mm_loadu_si128((__m128i*)(m + 448)));  \
+    _mm_storeu_si128((__m128i*)(c + 448), t3);                 \
+  }
+
 #define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

-#define ONEQUAD_UNPCK(A, B, C, D)                    \
-    {                                                \
-        x_##A = _mm256_add_epi32(x_##A, orig##A);    \
-        x_##B = _mm256_add_epi32(x_##B, orig##B);    \
-        x_##C = _mm256_add_epi32(x_##C, orig##C);    \
-        x_##D = _mm256_add_epi32(x_##D, orig##D);    \
-        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
-        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
-        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
-        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
-        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
-        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
-        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
-        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
-    }
+#define ONEQUAD_UNPCK(A, B, C, D)                \
+  {                                              \
+    x_##A = _mm256_add_epi32(x_##A, orig##A);    \
+    x_##B = _mm256_add_epi32(x_##B, orig##B);    \
+    x_##C = _mm256_add_epi32(x_##C, orig##C);    \
+    x_##D = _mm256_add_epi32(x_##D, orig##D);    \
+    t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
+    t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
+    t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
+    t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
+    x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
+    x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
+    x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
+    x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
+  }

 #define ONEOCTO(A, B, C, D, A2, B2, C2, D2)                                    \
-    {                                                                          \
-        ONEQUAD_UNPCK(A, B, C, D);                                             \
-        ONEQUAD_UNPCK(A2, B2, C2, D2);                                         \
-        t_##A  = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20);               \
-        t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31);               \
-        t_##B  = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20);               \
-        t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31);               \
-        t_##C  = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20);               \
-        t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31);               \
-        t_##D  = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20);               \
-        t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31);               \
-        t_##A =                                                                \
-            _mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*) (m + 0)));   \
-        t_##B =                                                                \
-            _mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*) (m + 64)));  \
-        t_##C =                                                                \
-            _mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
-        t_##D =                                                                \
-            _mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
-        t_##A2 = _mm256_xor_si256(t_##A2,                                      \
-                                  _mm256_loadu_si256((__m256i*) (m + 256)));   \
-        t_##B2 = _mm256_xor_si256(t_##B2,                                      \
-                                  _mm256_loadu_si256((__m256i*) (m + 320)));   \
-        t_##C2 = _mm256_xor_si256(t_##C2,                                      \
-                                  _mm256_loadu_si256((__m256i*) (m + 384)));   \
-        t_##D2 = _mm256_xor_si256(t_##D2,                                      \
-                                  _mm256_loadu_si256((__m256i*) (m + 448)));   \
-        _mm256_storeu_si256((__m256i*) (c + 0), t_##A);                        \
-        _mm256_storeu_si256((__m256i*) (c + 64), t_##B);                       \
-        _mm256_storeu_si256((__m256i*) (c + 128), t_##C);                      \
-        _mm256_storeu_si256((__m256i*) (c + 192), t_##D);                      \
-        _mm256_storeu_si256((__m256i*) (c + 256), t_##A2);                     \
-        _mm256_storeu_si256((__m256i*) (c + 320), t_##B2);                     \
-        _mm256_storeu_si256((__m256i*) (c + 384), t_##C2);                     \
-        _mm256_storeu_si256((__m256i*) (c + 448), t_##D2);                     \
-    }
-
-        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
-        m += 32;
-        c += 32;
-        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
-        m -= 32;
-        c -= 32;
+  {                                                                            \
+    ONEQUAD_UNPCK(A, B, C, D);                                                 \
+    ONEQUAD_UNPCK(A2, B2, C2, D2);                                             \
+    t_##A  = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20);                   \
+    t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31);                   \
+    t_##B  = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20);                   \
+    t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31);                   \
+    t_##C  = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20);                   \
+    t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31);                   \
+    t_##D  = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20);                   \
+    t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31);                   \
+    t_##A  = _mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*)(m + 0)));   \
+    t_##B  = _mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*)(m + 64)));  \
+    t_##C  = _mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*)(m + 128))); \
+    t_##D  = _mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*)(m + 192))); \
+    t_##A2 =                                                                   \
+        _mm256_xor_si256(t_##A2, _mm256_loadu_si256((__m256i*)(m + 256)));     \
+    t_##B2 =                                                                   \
+        _mm256_xor_si256(t_##B2, _mm256_loadu_si256((__m256i*)(m + 320)));     \
+    t_##C2 =                                                                   \
+        _mm256_xor_si256(t_##C2, _mm256_loadu_si256((__m256i*)(m + 384)));     \
+    t_##D2 =                                                                   \
+        _mm256_xor_si256(t_##D2, _mm256_loadu_si256((__m256i*)(m + 448)));     \
+    _mm256_storeu_si256((__m256i*)(c + 0), t_##A);                             \
+    _mm256_storeu_si256((__m256i*)(c + 64), t_##B);                            \
+    _mm256_storeu_si256((__m256i*)(c + 128), t_##C);                           \
+    _mm256_storeu_si256((__m256i*)(c + 192), t_##D);                           \
+    _mm256_storeu_si256((__m256i*)(c + 256), t_##A2);                          \
+    _mm256_storeu_si256((__m256i*)(c + 320), t_##B2);                          \
+    _mm256_storeu_si256((__m256i*)(c + 384), t_##C2);                          \
+    _mm256_storeu_si256((__m256i*)(c + 448), t_##D2);                          \
+  }
+
+    ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
+    m += 32;
+    c += 32;
+    ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
+    m -= 32;
+    c -= 32;

 #undef ONEQUAD
 #undef ONEQUAD_TRANSPOSE
 #undef ONEQUAD_UNPCK
 #undef ONEOCTO

-        bytes -= 512;
-        c += 512;
-        m += 512;
-    }
+    bytes -= 512;
+    c += 512;
+    m += 512;
+  }
 }
 #undef VEC8_ROT
 #undef VEC8_QUARTERROUND
--- a/crypto/chacha20/stream_chacha20.h
+++ b/crypto/chacha20/stream_chacha20.h
@ -4,19 +4,18 @@

 #include <stdint.h>

-typedef struct crypto_stream_chacha20_implementation {
-    int (*stream)(unsigned char *c, unsigned long long clen,
-                  const unsigned char *n, const unsigned char *k);
-    int (*stream_ietf)(unsigned char *c, unsigned long long clen,
-                      const unsigned char *n, const unsigned char *k);
-    int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
-                         unsigned long long mlen,
-                         const unsigned char *n, uint64_t ic,
-                         const unsigned char *k);
-    int (*stream_ietf_xor_ic)(unsigned char *c, const unsigned char *m,
-                              unsigned long long mlen,
-                              const unsigned char *n, uint32_t ic,
-                              const unsigned char *k);
+typedef struct crypto_stream_chacha20_implementation
+{
+  int (*stream)(unsigned char *c, unsigned long long clen,
+                const unsigned char *n, const unsigned char *k);
+  int (*stream_ietf)(unsigned char *c, unsigned long long clen,
+                     const unsigned char *n, const unsigned char *k);
+  int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
+                       unsigned long long mlen, const unsigned char *n,
+                       uint64_t ic, const unsigned char *k);
+  int (*stream_ietf_xor_ic)(unsigned char *c, const unsigned char *m,
+                            unsigned long long mlen, const unsigned char *n,
+                            uint32_t ic, const unsigned char *k);
 } crypto_stream_chacha20_implementation;

 #endif
--- a/crypto/csrng/randombytes.c
+++ b/crypto/csrng/randombytes.c
@ -74,8 +74,8 @@ randombytes_stir(void)
      try
      {
        var window_ = 'object' == = typeof window ? window : self;
-        var crypto_ = typeof window_.crypto != = 'undefined' ? window_.crypto
-                                                             : window_.msCrypto;
+        var crypto_               = typeof window_.crypto != =
+            'undefined' ? window_.crypto : window_.msCrypto;
        var randomValuesStandard = function()
        {
          var buf = new Uint32Array(1);
--- a/crypto/csrng/randombytes_salsa20_random.c
+++ b/crypto/csrng/randombytes_salsa20_random.c
@ -69,7 +69,7 @@ typedef NTSTATUS(FAR PASCAL *CNGAPI_DRBG)(BCRYPT_ALG_HANDLE, UCHAR *, ULONG,
 #endif

 #ifndef TLS
-#ifdef _WIN32 
+#ifdef _WIN32
 #ifdef _MSC_VER
 #define TLS __declspec(thread)
 #else
--- a/crypto/curve25519/ref10/fe_25_5/base.h
+++ b/crypto/curve25519/ref10/fe_25_5/base.h
--- a/crypto/curve25519/ref10/fe_25_5/base2.h
+++ b/crypto/curve25519/ref10/fe_25_5/base2.h
@ -1,40 +1,52 @@
-{
-  { 25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626, -11754271, -6079156, 2047605 },
-  { -12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384, 19500929, -15469378 },
-  { -8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919, 11864899, -24514362, -4438546 }
-},
-{
-  { 15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600, -14772189, 28944400, -1550024 },
-  { 16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577, -11775962, 7689662, 11199574 },
-  { 30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774, 10017326, -17749093, -9920357 }
-},
-{
-  { 10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885, 14515107, -15438304, 10819380 },
-  { 4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668, 12483688, -12668491, 5581306 },
-  { 19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350, 13850243, -23678021, -15815942 }
-},
-{
-  { 5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, 5230134, -23952439, -15175766 },
-  { -30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025, 16520125, 30598449, 7715701 },
-  { 28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660, 1370708, 29794553, -1409300 }
-},
-{
-  { -22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211, -1361450, -13062696, 13821877 },
-  { -6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028, -7212327, 18853322, -14220951 },
-  { 4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358, -10431137, 2207753, -3209784 }
-},
-{
-  { -25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364, -663000, -31111463, -16132436 },
-  { 25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789, 15725684, 171356, 6466918 },
-  { 23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339, -14088058, -30714912, 16193877 }
-},
-{
-  { -33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398, 4729455, -18074513, 9256800 },
-  { -25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405, 9761698, -19827198, 630305 },
-  { -13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551, -15960994, -2449256, -14291300 }
-},
-{
-  { -3151181, -5046075, 9282714, 6866145, -31907062, -863023, -18940575, 15033784, 25105118, -7894876 },
-  { -24326370, 15950226, -31801215, -14592823, -11662737, -5090925, 1573892, -2625887, 2198790, -15804619 },
-  { -3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022, -16236442, -32461234, -12290683 }
+{{25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626,
+  -11754271, -6079156, 2047605},
+ {-12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384,
+  19500929, -15469378},
+ {-8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919,
+  11864899, -24514362, -4438546}},
+    {{15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600,
+      -14772189, 28944400, -1550024},
+     {16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577,
+      -11775962, 7689662, 11199574},
+     {30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774,
+      10017326, -17749093, -9920357}},
+    {{10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885,
+      14515107, -15438304, 10819380},
+     {4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668,
+      12483688, -12668491, 5581306},
+     {19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350,
+      13850243, -23678021, -15815942}},
+    {{5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852, 5230134,
+      -23952439, -15175766},
+     {-30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025,
+      16520125, 30598449, 7715701},
+     {28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660,
+      1370708, 29794553, -1409300}},
+    {{-22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211,
+      -1361450, -13062696, 13821877},
+     {-6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028,
+      -7212327, 18853322, -14220951},
+     {4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358,
+      -10431137, 2207753, -3209784}},
+    {{-25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364,
+      -663000, -31111463, -16132436},
+     {25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789,
+      15725684, 171356, 6466918},
+     {23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339,
+      -14088058, -30714912, 16193877}},
+    {{-33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398,
+      4729455, -18074513, 9256800},
+     {-25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405,
+      9761698, -19827198, 630305},
+     {-13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551,
+      -15960994, -2449256, -14291300}},
+{
+  {-3151181, -5046075,  9282714,  6866145,  -31907062,
+   -863023,  -18940575, 15033784, 25105118, -7894876},
+      {-24326370, 15950226, -31801215, -14592823, -11662737,
+       -5090925,  1573892,  -2625887,  2198790,   -15804619},
+  {
+    -3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022,
+        -16236442, -32461234, -12290683
+  }
 }
--- a/crypto/curve25519/ref10/fe_25_5/constants.h
+++ b/crypto/curve25519/ref10/fe_25_5/constants.h
@ -1,20 +1,18 @@
-/* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
-static const fe25519 d = {
-    -10913610, 13857413, -15372611, 6949391,   114729, -8787816, -6275908, -3247719, -18696448, -12055116
-};
+/* 37095705934669439343138083508754565189542113879843219016388785533085940283555
+ */
+static const fe25519 d = {-10913610, 13857413, -15372611, 6949391,   114729,
+                          -8787816,  -6275908, -3247719,  -18696448, -12055116};

 /* 2 * d =
 * 16295367250680780974490674513165176452449235426866156013048779062215315747161
 */
-static const fe25519 d2 = {
-    -21827239, -5839606,  -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199 };
+static const fe25519 d2 = {-21827239, -5839606,  -30745221, 13898782, 229458,
+                           15978800,  -12551817, -6495438,  29715968, 9444199};

 /* sqrt(-1) */
-static const fe25519 sqrtm1 = {
-    -32595792, -7943725,  9377950,  3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482
-};
+static const fe25519 sqrtm1 = {-32595792, -7943725, 9377950,   3500415,
+                               12389472,  -272473,  -25146209, -2005654,
+                               326686,    11406482};

 /* A = 486662 */
-static const fe25519 curve25519_A = {
-    486662, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
+static const fe25519 curve25519_A = {486662, 0, 0, 0, 0, 0, 0, 0, 0, 0};
--- a/crypto/curve25519/ref10/fe_25_5/fe.h
+++ b/crypto/curve25519/ref10/fe_25_5/fe.h
@ -5,70 +5,70 @@
 void
 fe25519_frombytes(fe25519 h, const unsigned char *s)
 {
-    int64_t h0 = load_4(s);
-    int64_t h1 = load_3(s + 4) << 6;
-    int64_t h2 = load_3(s + 7) << 5;
-    int64_t h3 = load_3(s + 10) << 3;
-    int64_t h4 = load_3(s + 13) << 2;
-    int64_t h5 = load_4(s + 16);
-    int64_t h6 = load_3(s + 20) << 7;
-    int64_t h7 = load_3(s + 23) << 5;
-    int64_t h8 = load_3(s + 26) << 4;
-    int64_t h9 = (load_3(s + 29) & 8388607) << 2;
-
-    int64_t carry0;
-    int64_t carry1;
-    int64_t carry2;
-    int64_t carry3;
-    int64_t carry4;
-    int64_t carry5;
-    int64_t carry6;
-    int64_t carry7;
-    int64_t carry8;
-    int64_t carry9;
-
-    carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
-    h0 += carry9 * 19;
-    h9 -= carry9 * ((uint64_t) 1L << 25);
-    carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
-    h2 += carry1;
-    h1 -= carry1 * ((uint64_t) 1L << 25);
-    carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
-    h4 += carry3;
-    h3 -= carry3 * ((uint64_t) 1L << 25);
-    carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
-    h6 += carry5;
-    h5 -= carry5 * ((uint64_t) 1L << 25);
-    carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
-    h8 += carry7;
-    h7 -= carry7 * ((uint64_t) 1L << 25);
-
-    carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
-    h1 += carry0;
-    h0 -= carry0 * ((uint64_t) 1L << 26);
-    carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
-    h3 += carry2;
-    h2 -= carry2 * ((uint64_t) 1L << 26);
-    carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
-    h5 += carry4;
-    h4 -= carry4 * ((uint64_t) 1L << 26);
-    carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
-    h7 += carry6;
-    h6 -= carry6 * ((uint64_t) 1L << 26);
-    carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
-    h9 += carry8;
-    h8 -= carry8 * ((uint64_t) 1L << 26);
-
-    h[0] = (int32_t) h0;
-    h[1] = (int32_t) h1;
-    h[2] = (int32_t) h2;
-    h[3] = (int32_t) h3;
-    h[4] = (int32_t) h4;
-    h[5] = (int32_t) h5;
-    h[6] = (int32_t) h6;
-    h[7] = (int32_t) h7;
-    h[8] = (int32_t) h8;
-    h[9] = (int32_t) h9;
+  int64_t h0 = load_4(s);
+  int64_t h1 = load_3(s + 4) << 6;
+  int64_t h2 = load_3(s + 7) << 5;
+  int64_t h3 = load_3(s + 10) << 3;
+  int64_t h4 = load_3(s + 13) << 2;
+  int64_t h5 = load_4(s + 16);
+  int64_t h6 = load_3(s + 20) << 7;
+  int64_t h7 = load_3(s + 23) << 5;
+  int64_t h8 = load_3(s + 26) << 4;
+  int64_t h9 = (load_3(s + 29) & 8388607) << 2;
+
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+
+  carry9 = (h9 + (int64_t)(1L << 24)) >> 25;
+  h0 += carry9 * 19;
+  h9 -= carry9 * ((uint64_t)1L << 25);
+  carry1 = (h1 + (int64_t)(1L << 24)) >> 25;
+  h2 += carry1;
+  h1 -= carry1 * ((uint64_t)1L << 25);
+  carry3 = (h3 + (int64_t)(1L << 24)) >> 25;
+  h4 += carry3;
+  h3 -= carry3 * ((uint64_t)1L << 25);
+  carry5 = (h5 + (int64_t)(1L << 24)) >> 25;
+  h6 += carry5;
+  h5 -= carry5 * ((uint64_t)1L << 25);
+  carry7 = (h7 + (int64_t)(1L << 24)) >> 25;
+  h8 += carry7;
+  h7 -= carry7 * ((uint64_t)1L << 25);
+
+  carry0 = (h0 + (int64_t)(1L << 25)) >> 26;
+  h1 += carry0;
+  h0 -= carry0 * ((uint64_t)1L << 26);
+  carry2 = (h2 + (int64_t)(1L << 25)) >> 26;
+  h3 += carry2;
+  h2 -= carry2 * ((uint64_t)1L << 26);
+  carry4 = (h4 + (int64_t)(1L << 25)) >> 26;
+  h5 += carry4;
+  h4 -= carry4 * ((uint64_t)1L << 26);
+  carry6 = (h6 + (int64_t)(1L << 25)) >> 26;
+  h7 += carry6;
+  h6 -= carry6 * ((uint64_t)1L << 26);
+  carry8 = (h8 + (int64_t)(1L << 25)) >> 26;
+  h9 += carry8;
+  h8 -= carry8 * ((uint64_t)1L << 26);
+
+  h[0] = (int32_t)h0;
+  h[1] = (int32_t)h1;
+  h[2] = (int32_t)h2;
+  h[3] = (int32_t)h3;
+  h[4] = (int32_t)h4;
+  h[5] = (int32_t)h5;
+  h[6] = (int32_t)h6;
+  h[7] = (int32_t)h7;
+  h[8] = (int32_t)h8;
+  h[9] = (int32_t)h9;
 }

 /*
@ -99,76 +99,77 @@ fe25519_frombytes(fe25519 h, const unsigned char *s)
 static void
 fe25519_reduce(fe25519 h, const fe25519 f)
 {
-    int32_t h0 = f[0];
-    int32_t h1 = f[1];
-    int32_t h2 = f[2];
-    int32_t h3 = f[3];
-    int32_t h4 = f[4];
-    int32_t h5 = f[5];
-    int32_t h6 = f[6];
-    int32_t h7 = f[7];
-    int32_t h8 = f[8];
-    int32_t h9 = f[9];
-
-    int32_t q;
-    int32_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;
-
-    q = (19 * h9 + ((uint32_t) 1L << 24)) >> 25;
-    q = (h0 + q) >> 26;
-    q = (h1 + q) >> 25;
-    q = (h2 + q) >> 26;
-    q = (h3 + q) >> 25;
-    q = (h4 + q) >> 26;
-    q = (h5 + q) >> 25;
-    q = (h6 + q) >> 26;
-    q = (h7 + q) >> 25;
-    q = (h8 + q) >> 26;
-    q = (h9 + q) >> 25;
-
-    /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
-    h0 += 19 * q;
-    /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
-
-    carry0 = h0 >> 26;
-    h1 += carry0;
-    h0 -= carry0 * ((uint32_t) 1L << 26);
-    carry1 = h1 >> 25;
-    h2 += carry1;
-    h1 -= carry1 * ((uint32_t) 1L << 25);
-    carry2 = h2 >> 26;
-    h3 += carry2;
-    h2 -= carry2 * ((uint32_t) 1L << 26);
-    carry3 = h3 >> 25;
-    h4 += carry3;
-    h3 -= carry3 * ((uint32_t) 1L << 25);
-    carry4 = h4 >> 26;
-    h5 += carry4;
-    h4 -= carry4 * ((uint32_t) 1L << 26);
-    carry5 = h5 >> 25;
-    h6 += carry5;
-    h5 -= carry5 * ((uint32_t) 1L << 25);
-    carry6 = h6 >> 26;
-    h7 += carry6;
-    h6 -= carry6 * ((uint32_t) 1L << 26);
-    carry7 = h7 >> 25;
-    h8 += carry7;
-    h7 -= carry7 * ((uint32_t) 1L << 25);
-    carry8 = h8 >> 26;
-    h9 += carry8;
-    h8 -= carry8 * ((uint32_t) 1L << 26);
-    carry9 = h9 >> 25;
-    h9 -= carry9 * ((uint32_t) 1L << 25);
-
-    h[0] = h0;
-    h[1] = h1;
-    h[2] = h2;
-    h[3] = h3;
-    h[4] = h4;
-    h[5] = h5;
-    h[6] = h6;
-    h[7] = h7;
-    h[8] = h8;
-    h[9] = h9;
+  int32_t h0 = f[0];
+  int32_t h1 = f[1];
+  int32_t h2 = f[2];
+  int32_t h3 = f[3];
+  int32_t h4 = f[4];
+  int32_t h5 = f[5];
+  int32_t h6 = f[6];
+  int32_t h7 = f[7];
+  int32_t h8 = f[8];
+  int32_t h9 = f[9];
+
+  int32_t q;
+  int32_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7,
+      carry8, carry9;
+
+  q = (19 * h9 + ((uint32_t)1L << 24)) >> 25;
+  q = (h0 + q) >> 26;
+  q = (h1 + q) >> 25;
+  q = (h2 + q) >> 26;
+  q = (h3 + q) >> 25;
+  q = (h4 + q) >> 26;
+  q = (h5 + q) >> 25;
+  q = (h6 + q) >> 26;
+  q = (h7 + q) >> 25;
+  q = (h8 + q) >> 26;
+  q = (h9 + q) >> 25;
+
+  /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
+  h0 += 19 * q;
+  /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
+
+  carry0 = h0 >> 26;
+  h1 += carry0;
+  h0 -= carry0 * ((uint32_t)1L << 26);
+  carry1 = h1 >> 25;
+  h2 += carry1;
+  h1 -= carry1 * ((uint32_t)1L << 25);
+  carry2 = h2 >> 26;
+  h3 += carry2;
+  h2 -= carry2 * ((uint32_t)1L << 26);
+  carry3 = h3 >> 25;
+  h4 += carry3;
+  h3 -= carry3 * ((uint32_t)1L << 25);
+  carry4 = h4 >> 26;
+  h5 += carry4;
+  h4 -= carry4 * ((uint32_t)1L << 26);
+  carry5 = h5 >> 25;
+  h6 += carry5;
+  h5 -= carry5 * ((uint32_t)1L << 25);
+  carry6 = h6 >> 26;
+  h7 += carry6;
+  h6 -= carry6 * ((uint32_t)1L << 26);
+  carry7 = h7 >> 25;
+  h8 += carry7;
+  h7 -= carry7 * ((uint32_t)1L << 25);
+  carry8 = h8 >> 26;
+  h9 += carry8;
+  h8 -= carry8 * ((uint32_t)1L << 26);
+  carry9 = h9 >> 25;
+  h9 -= carry9 * ((uint32_t)1L << 25);
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
 }

 /*
@ -182,39 +183,39 @@ fe25519_reduce(fe25519 h, const fe25519 f)
 void
 fe25519_tobytes(unsigned char *s, const fe25519 h)
 {
-    fe25519 t;
-
-    fe25519_reduce(t, h);
-    s[0]  = t[0] >> 0;
-    s[1]  = t[0] >> 8;
-    s[2]  = t[0] >> 16;
-    s[3]  = (t[0] >> 24) | (t[1] * ((uint32_t) 1 << 2));
-    s[4]  = t[1] >> 6;
-    s[5]  = t[1] >> 14;
-    s[6]  = (t[1] >> 22) | (t[2] * ((uint32_t) 1 << 3));
-    s[7]  = t[2] >> 5;
-    s[8]  = t[2] >> 13;
-    s[9]  = (t[2] >> 21) | (t[3] * ((uint32_t) 1 << 5));
-    s[10] = t[3] >> 3;
-    s[11] = t[3] >> 11;
-    s[12] = (t[3] >> 19) | (t[4] * ((uint32_t) 1 << 6));
-    s[13] = t[4] >> 2;
-    s[14] = t[4] >> 10;
-    s[15] = t[4] >> 18;
-    s[16] = t[5] >> 0;
-    s[17] = t[5] >> 8;
-    s[18] = t[5] >> 16;
-    s[19] = (t[5] >> 24) | (t[6] * ((uint32_t) 1 << 1));
-    s[20] = t[6] >> 7;
-    s[21] = t[6] >> 15;
-    s[22] = (t[6] >> 23) | (t[7] * ((uint32_t) 1 << 3));
-    s[23] = t[7] >> 5;
-    s[24] = t[7] >> 13;
-    s[25] = (t[7] >> 21) | (t[8] * ((uint32_t) 1 << 4));
-    s[26] = t[8] >> 4;
-    s[27] = t[8] >> 12;
-    s[28] = (t[8] >> 20) | (t[9] * ((uint32_t) 1 << 6));
-    s[29] = t[9] >> 2;
-    s[30] = t[9] >> 10;
-    s[31] = t[9] >> 18;
+  fe25519 t;
+
+  fe25519_reduce(t, h);
+  s[0]  = t[0] >> 0;
+  s[1]  = t[0] >> 8;
+  s[2]  = t[0] >> 16;
+  s[3]  = (t[0] >> 24) | (t[1] * ((uint32_t)1 << 2));
+  s[4]  = t[1] >> 6;
+  s[5]  = t[1] >> 14;
+  s[6]  = (t[1] >> 22) | (t[2] * ((uint32_t)1 << 3));
+  s[7]  = t[2] >> 5;
+  s[8]  = t[2] >> 13;
+  s[9]  = (t[2] >> 21) | (t[3] * ((uint32_t)1 << 5));
+  s[10] = t[3] >> 3;
+  s[11] = t[3] >> 11;
+  s[12] = (t[3] >> 19) | (t[4] * ((uint32_t)1 << 6));
+  s[13] = t[4] >> 2;
+  s[14] = t[4] >> 10;
+  s[15] = t[4] >> 18;
+  s[16] = t[5] >> 0;
+  s[17] = t[5] >> 8;
+  s[18] = t[5] >> 16;
+  s[19] = (t[5] >> 24) | (t[6] * ((uint32_t)1 << 1));
+  s[20] = t[6] >> 7;
+  s[21] = t[6] >> 15;
+  s[22] = (t[6] >> 23) | (t[7] * ((uint32_t)1 << 3));
+  s[23] = t[7] >> 5;
+  s[24] = t[7] >> 13;
+  s[25] = (t[7] >> 21) | (t[8] * ((uint32_t)1 << 4));
+  s[26] = t[8] >> 4;
+  s[27] = t[8] >> 12;
+  s[28] = (t[8] >> 20) | (t[9] * ((uint32_t)1 << 6));
+  s[29] = t[9] >> 2;
+  s[30] = t[9] >> 10;
+  s[31] = t[9] >> 18;
 }
--- a/crypto/curve25519/sandy2x/consts_namespace.h
+++ b/crypto/curve25519/sandy2x/consts_namespace.h
@ -17,4 +17,3 @@
 #define REDMASK51 crypto_scalarmult_curve25519_sandy2x_REDMASK51

 #endif /* ifndef consts_namespace_H */
-
--- a/crypto/curve25519/sandy2x/curve25519_sandy2x.c
+++ b/crypto/curve25519/sandy2x/curve25519_sandy2x.c
@ -26,13 +26,14 @@ crypto_scalarmult_curve25519_sandy2x(unsigned char *q, const unsigned char *n,
                                     const unsigned char *p)
 {
  unsigned char *t = q;
-  fe             var[3];
-  fe51           x_51;
-  fe51           z_51;
-  unsigned int   i;
-
-  for (i = 0; i < 32; i++) {
-      t[i] = n[i];
+  fe var[3];
+  fe51 x_51;
+  fe51 z_51;
+  unsigned int i;
+
+  for(i = 0; i < 32; i++)
+  {
+    t[i] = n[i];
  }
  t[0] &= 248;
  t[31] &= 127;
@ -72,13 +73,14 @@ crypto_scalarmult_curve25519_sandy2x_base(unsigned char *q,
                                          const unsigned char *n)
 {
  unsigned char *t = q;
-  fe             var[3];
-  fe51           x_51;
-  fe51           z_51;
-  unsigned int   i;
-
-  for (i = 0;i < 32; i++) {
-      t[i] = n[i];
+  fe var[3];
+  fe51 x_51;
+  fe51 z_51;
+  unsigned int i;
+
+  for(i = 0; i < 32; i++)
+  {
+    t[i] = n[i];
  }
  t[0] &= 248;
  t[31] &= 127;
@ -106,9 +108,8 @@ crypto_scalarmult_curve25519_sandy2x_base(unsigned char *q,
 }

 struct crypto_scalarmult_curve25519_implementation
-crypto_scalarmult_curve25519_sandy2x_implementation = {
-    SODIUM_C99(.mult = ) crypto_scalarmult_curve25519_sandy2x,
-    SODIUM_C99(.mult_base = ) crypto_scalarmult_curve25519_sandy2x_base
-};
+    crypto_scalarmult_curve25519_sandy2x_implementation = {
+        SODIUM_C99(.mult =) crypto_scalarmult_curve25519_sandy2x,
+        SODIUM_C99(.mult_base =) crypto_scalarmult_curve25519_sandy2x_base};

 #endif
--- a/crypto/curve25519/sandy2x/fe.h
+++ b/crypto/curve25519/sandy2x/fe.h
@ -21,6 +21,7 @@ Bounds on each t[i] vary depending on context.

 #define fe_frombytes crypto_scalarmult_curve25519_sandy2x_fe_frombytes

-extern void fe_frombytes(fe, const unsigned char *);
+extern void
+fe_frombytes(fe, const unsigned char *);

 #endif
--- a/crypto/curve25519/sandy2x/fe51.h
+++ b/crypto/curve25519/sandy2x/fe51.h
@ -9,7 +9,8 @@
 #define fe51_H

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #include <stdint.h>
@ -17,16 +18,19 @@ extern "C" {

 #include "fe51_namespace.h"

-typedef struct
-{
+  typedef struct
+  {
    uint64_t v[5];
-}
-fe51;
-
-extern void fe51_pack(unsigned char *, const fe51 *);
-extern void fe51_mul(fe51 *, const fe51 *, const fe51 *);
-extern void fe51_nsquare(fe51 *, const fe51 *, int);
-extern void fe51_invert(fe51 *, const fe51 *);
+  } fe51;
+
+  extern void
+  fe51_pack(unsigned char *, const fe51 *);
+  extern void
+  fe51_mul(fe51 *, const fe51 *, const fe51 *);
+  extern void
+  fe51_nsquare(fe51 *, const fe51 *, int);
+  extern void
+  fe51_invert(fe51 *, const fe51 *);

 #ifdef __cplusplus
 }
--- a/crypto/curve25519/sandy2x/fe51_invert.c
+++ b/crypto/curve25519/sandy2x/fe51_invert.c
@ -12,47 +12,47 @@
 void
 fe51_invert(fe51 *r, const fe51 *x)
 {
-    fe51 z2;
-    fe51 z9;
-    fe51 z11;
-    fe51 z2_5_0;
-    fe51 z2_10_0;
-    fe51 z2_20_0;
-    fe51 z2_50_0;
-    fe51 z2_100_0;
-    fe51 t;
-
-    /* 2 */ fe51_square(&z2,x);
-    /* 4 */ fe51_square(&t,&z2);
-    /* 8 */ fe51_square(&t,&t);
-    /* 9 */ fe51_mul(&z9,&t,x);
-    /* 11 */ fe51_mul(&z11,&z9,&z2);
-    /* 22 */ fe51_square(&t,&z11);
-    /* 2^5 - 2^0 = 31 */ fe51_mul(&z2_5_0,&t,&z9);
-
-    /* 2^10 - 2^5 */ fe51_nsquare(&t,&z2_5_0, 5);
-    /* 2^10 - 2^0 */ fe51_mul(&z2_10_0,&t,&z2_5_0);
-
-    /* 2^20 - 2^10 */ fe51_nsquare(&t,&z2_10_0, 10);
-    /* 2^20 - 2^0 */ fe51_mul(&z2_20_0,&t,&z2_10_0);
-
-    /* 2^40 - 2^20 */ fe51_nsquare(&t,&z2_20_0, 20);
-    /* 2^40 - 2^0 */ fe51_mul(&t,&t,&z2_20_0);
-
-    /* 2^50 - 2^10 */ fe51_nsquare(&t,&t,10);
-    /* 2^50 - 2^0 */ fe51_mul(&z2_50_0,&t,&z2_10_0);
-
-    /* 2^100 - 2^50 */ fe51_nsquare(&t,&z2_50_0, 50);
-    /* 2^100 - 2^0 */ fe51_mul(&z2_100_0,&t,&z2_50_0);
-
-    /* 2^200 - 2^100 */ fe51_nsquare(&t,&z2_100_0, 100);
-    /* 2^200 - 2^0 */ fe51_mul(&t,&t,&z2_100_0);
-
-    /* 2^250 - 2^50 */ fe51_nsquare(&t,&t, 50);
-    /* 2^250 - 2^0 */ fe51_mul(&t,&t,&z2_50_0);
-
-    /* 2^255 - 2^5 */ fe51_nsquare(&t,&t,5);
-    /* 2^255 - 21 */ fe51_mul(r,&t,&z11);
+  fe51 z2;
+  fe51 z9;
+  fe51 z11;
+  fe51 z2_5_0;
+  fe51 z2_10_0;
+  fe51 z2_20_0;
+  fe51 z2_50_0;
+  fe51 z2_100_0;
+  fe51 t;
+
+  /* 2 */ fe51_square(&z2, x);
+  /* 4 */ fe51_square(&t, &z2);
+  /* 8 */ fe51_square(&t, &t);
+  /* 9 */ fe51_mul(&z9, &t, x);
+  /* 11 */ fe51_mul(&z11, &z9, &z2);
+  /* 22 */ fe51_square(&t, &z11);
+  /* 2^5 - 2^0 = 31 */ fe51_mul(&z2_5_0, &t, &z9);
+
+  /* 2^10 - 2^5 */ fe51_nsquare(&t, &z2_5_0, 5);
+  /* 2^10 - 2^0 */ fe51_mul(&z2_10_0, &t, &z2_5_0);
+
+  /* 2^20 - 2^10 */ fe51_nsquare(&t, &z2_10_0, 10);
+  /* 2^20 - 2^0 */ fe51_mul(&z2_20_0, &t, &z2_10_0);
+
+  /* 2^40 - 2^20 */ fe51_nsquare(&t, &z2_20_0, 20);
+  /* 2^40 - 2^0 */ fe51_mul(&t, &t, &z2_20_0);
+
+  /* 2^50 - 2^10 */ fe51_nsquare(&t, &t, 10);
+  /* 2^50 - 2^0 */ fe51_mul(&z2_50_0, &t, &z2_10_0);
+
+  /* 2^100 - 2^50 */ fe51_nsquare(&t, &z2_50_0, 50);
+  /* 2^100 - 2^0 */ fe51_mul(&z2_100_0, &t, &z2_50_0);
+
+  /* 2^200 - 2^100 */ fe51_nsquare(&t, &z2_100_0, 100);
+  /* 2^200 - 2^0 */ fe51_mul(&t, &t, &z2_100_0);
+
+  /* 2^250 - 2^50 */ fe51_nsquare(&t, &t, 50);
+  /* 2^250 - 2^0 */ fe51_mul(&t, &t, &z2_50_0);
+
+  /* 2^255 - 2^5 */ fe51_nsquare(&t, &t, 5);
+  /* 2^255 - 21 */ fe51_mul(r, &t, &z11);
 }

 #endif
--- a/crypto/curve25519/sandy2x/fe51_namespace.h
+++ b/crypto/curve25519/sandy2x/fe51_namespace.h
@ -1,16 +1,15 @@
 #ifndef fe51_namespace_H
 #define fe51_namespace_H

-#define  fe51              crypto_scalarmult_curve25519_sandy2x_fe51
-#define _fe51             _crypto_scalarmult_curve25519_sandy2x_fe51
-#define  fe51_pack         crypto_scalarmult_curve25519_sandy2x_fe51_pack
-#define _fe51_pack        _crypto_scalarmult_curve25519_sandy2x_fe51_pack
-#define  fe51_mul          crypto_scalarmult_curve25519_sandy2x_fe51_mul
-#define _fe51_mul         _crypto_scalarmult_curve25519_sandy2x_fe51_mul
-#define  fe51_nsquare      crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
-#define _fe51_nsquare     _crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
+#define fe51 crypto_scalarmult_curve25519_sandy2x_fe51
+#define _fe51 _crypto_scalarmult_curve25519_sandy2x_fe51
+#define fe51_pack crypto_scalarmult_curve25519_sandy2x_fe51_pack
+#define _fe51_pack _crypto_scalarmult_curve25519_sandy2x_fe51_pack
+#define fe51_mul crypto_scalarmult_curve25519_sandy2x_fe51_mul
+#define _fe51_mul _crypto_scalarmult_curve25519_sandy2x_fe51_mul
+#define fe51_nsquare crypto_scalarmult_curve25519_sandy2x_fe51_nsquare
+#define _fe51_nsquare _crypto_scalarmult_curve25519_sandy2x_fe51_nsquare

-#define  fe51_invert       crypto_scalarmult_curve25519_sandy2x_fe51_invert
+#define fe51_invert crypto_scalarmult_curve25519_sandy2x_fe51_invert

 #endif /* ifndef fe51_namespace_H */
-
--- a/crypto/curve25519/sandy2x/fe_frombytes_sandy2x.c
+++ b/crypto/curve25519/sandy2x/fe_frombytes_sandy2x.c
@ -10,9 +10,9 @@ static uint64_t
 load_3(const unsigned char *in)
 {
  uint64_t result;
-  result = (uint64_t) in[0];
-  result |= ((uint64_t) in[1]) << 8;
-  result |= ((uint64_t) in[2]) << 16;
+  result = (uint64_t)in[0];
+  result |= ((uint64_t)in[1]) << 8;
+  result |= ((uint64_t)in[2]) << 16;
  return result;
 }

@ -20,10 +20,10 @@ static uint64_t
 load_4(const unsigned char *in)
 {
  uint64_t result;
-  result = (uint64_t) in[0];
-  result |= ((uint64_t) in[1]) << 8;
-  result |= ((uint64_t) in[2]) << 16;
-  result |= ((uint64_t) in[3]) << 24;
+  result = (uint64_t)in[0];
+  result |= ((uint64_t)in[1]) << 8;
+  result |= ((uint64_t)in[2]) << 16;
+  result |= ((uint64_t)in[3]) << 24;
  return result;
 }

@ -51,17 +51,37 @@ fe_frombytes(fe h, const unsigned char *s)
  uint64_t carry8;
  uint64_t carry9;

-  carry9 = h9 >> 25; h0 += carry9 * 19; h9 &= 0x1FFFFFF;
-  carry1 = h1 >> 25; h2 += carry1; h1 &= 0x1FFFFFF;
-  carry3 = h3 >> 25; h4 += carry3; h3 &= 0x1FFFFFF;
-  carry5 = h5 >> 25; h6 += carry5; h5 &= 0x1FFFFFF;
-  carry7 = h7 >> 25; h8 += carry7; h7 &= 0x1FFFFFF;
+  carry9 = h9 >> 25;
+  h0 += carry9 * 19;
+  h9 &= 0x1FFFFFF;
+  carry1 = h1 >> 25;
+  h2 += carry1;
+  h1 &= 0x1FFFFFF;
+  carry3 = h3 >> 25;
+  h4 += carry3;
+  h3 &= 0x1FFFFFF;
+  carry5 = h5 >> 25;
+  h6 += carry5;
+  h5 &= 0x1FFFFFF;
+  carry7 = h7 >> 25;
+  h8 += carry7;
+  h7 &= 0x1FFFFFF;

-  carry0 = h0 >> 26; h1 += carry0; h0 &= 0x3FFFFFF;
-  carry2 = h2 >> 26; h3 += carry2; h2 &= 0x3FFFFFF;
-  carry4 = h4 >> 26; h5 += carry4; h4 &= 0x3FFFFFF;
-  carry6 = h6 >> 26; h7 += carry6; h6 &= 0x3FFFFFF;
-  carry8 = h8 >> 26; h9 += carry8; h8 &= 0x3FFFFFF;
+  carry0 = h0 >> 26;
+  h1 += carry0;
+  h0 &= 0x3FFFFFF;
+  carry2 = h2 >> 26;
+  h3 += carry2;
+  h2 &= 0x3FFFFFF;
+  carry4 = h4 >> 26;
+  h5 += carry4;
+  h4 &= 0x3FFFFFF;
+  carry6 = h6 >> 26;
+  h7 += carry6;
+  h6 &= 0x3FFFFFF;
+  carry8 = h8 >> 26;
+  h9 += carry8;
+  h8 &= 0x3FFFFFF;

  h[0] = h0;
  h[1] = h1;
--- a/crypto/curve25519/sandy2x/ladder.h
+++ b/crypto/curve25519/sandy2x/ladder.h
@ -2,17 +2,18 @@
 #define ladder_H

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #include "fe.h"
 #include "ladder_namespace.h"

-extern void ladder(fe *, const unsigned char *);
+  extern void
+  ladder(fe *, const unsigned char *);

 #ifdef __cplusplus
 }
 #endif

 #endif /* ifndef ladder_H */
-
--- a/crypto/curve25519/sandy2x/ladder_base.h
+++ b/crypto/curve25519/sandy2x/ladder_base.h
@ -2,17 +2,18 @@
 #define ladder_base_H

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #include "fe.h"
 #include "ladder_base_namespace.h"

-extern void ladder_base(fe *, const unsigned char *);
+  extern void
+  ladder_base(fe *, const unsigned char *);

 #ifdef __cplusplus
 }
 #endif

 #endif /* ifndef ladder_base_H */
-
--- a/crypto/curve25519/sandy2x/ladder_base_namespace.h
+++ b/crypto/curve25519/sandy2x/ladder_base_namespace.h
@ -1,8 +1,7 @@
 #ifndef ladder_base_namespace_H
 #define ladder_base_namespace_H

-#define  ladder_base  crypto_scalarmult_curve25519_sandy2x_ladder_base
+#define ladder_base crypto_scalarmult_curve25519_sandy2x_ladder_base
 #define _ladder_base _crypto_scalarmult_curve25519_sandy2x_ladder_base

 #endif /* ifndef ladder_base_namespace_H */
-
--- a/crypto/curve25519/sandy2x/ladder_namespace.h
+++ b/crypto/curve25519/sandy2x/ladder_namespace.h
@ -1,8 +1,7 @@
 #ifndef ladder_namespace_H
 #define ladder_namespace_H

-#define  ladder  crypto_scalarmult_curve25519_sandy2x_ladder
+#define ladder crypto_scalarmult_curve25519_sandy2x_ladder
 #define _ladder _crypto_scalarmult_curve25519_sandy2x_ladder

 #endif /* ifndef ladder_namespace_H */
-
--- a/crypto/curve25519/scalarmult_curve25519.h
+++ b/crypto/curve25519/scalarmult_curve25519.h
@ -2,10 +2,10 @@
 #ifndef scalarmult_poly1305_H
 #define scalarmult_poly1305_H

-typedef struct crypto_scalarmult_curve25519_implementation {
-    int (*mult)(unsigned char *q, const unsigned char *n,
-                const unsigned char *p);
-    int (*mult_base)(unsigned char *q, const unsigned char *n);
+typedef struct crypto_scalarmult_curve25519_implementation
+{
+  int (*mult)(unsigned char *q, const unsigned char *n, const unsigned char *p);
+  int (*mult_base)(unsigned char *q, const unsigned char *n);
 } crypto_scalarmult_curve25519_implementation;

 #endif
--- a/crypto/ed25519/ref10/sign_ed25519_ref10.h
+++ b/crypto/ed25519/ref10/sign_ed25519_ref10.h
@ -1,18 +1,17 @@
 #ifndef sign_ed25519_ref10_H
 #define sign_ed25519_ref10_H

-void _crypto_sign_ed25519_ref10_hinit(crypto_hash_sha512_state *hs,
-                                      int prehashed);
+void
+_crypto_sign_ed25519_ref10_hinit(crypto_hash_sha512_state *hs, int prehashed);

-int _crypto_sign_ed25519_detached(unsigned char *sig,
-                                  unsigned long long *siglen_p,
-                                  const unsigned char *m,
-                                  unsigned long long mlen,
-                                  const unsigned char *sk, int prehashed);
+int
+_crypto_sign_ed25519_detached(unsigned char *sig, unsigned long long *siglen_p,
+                              const unsigned char *m, unsigned long long mlen,
+                              const unsigned char *sk, int prehashed);

-int _crypto_sign_ed25519_verify_detached(const unsigned char *sig,
-                                         const unsigned char *m,
-                                         unsigned long long   mlen,
-                                         const unsigned char *pk,
-                                         int prehashed);
+int
+_crypto_sign_ed25519_verify_detached(const unsigned char *sig,
+                                     const unsigned char *m,
+                                     unsigned long long mlen,
+                                     const unsigned char *pk, int prehashed);
 #endif
--- a/crypto/include/libntrup/ntru_api.h
+++ b/crypto/include/libntrup/ntru_api.h
@@ -1,12 +1,22 @@

-int crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
-  
-int crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
+int
+crypto_kem_enc_ref(unsigned char *cstr, unsigned char *k,
+                   const unsigned char *pk);

-int crypto_kem_keypair_ref(unsigned char *pk, unsigned char * sk);
+int
+crypto_kem_dec_ref(unsigned char *k, const unsigned char *cstr,
+                   const unsigned char *sk);

-int crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k, const unsigned char *pk);
-  
-int crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr, const unsigned char *sk);
+int
+crypto_kem_keypair_ref(unsigned char *pk, unsigned char *sk);

-int crypto_kem_keypair_avx2(unsigned char *pk, unsigned char * sk);
+int
+crypto_kem_enc_avx2(unsigned char *cstr, unsigned char *k,
+                    const unsigned char *pk);
+
+int
+crypto_kem_dec_avx2(unsigned char *k, const unsigned char *cstr,
+                    const unsigned char *sk);
+
+int
+crypto_kem_keypair_avx2(unsigned char *pk, unsigned char *sk);
--- a/crypto/include/sodium/common.h
+++ b/crypto/include/sodium/common.h
@@ -5,42 +5,42 @@
 #include <stdlib.h>
 #include <string.h>

-#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
+#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])

 #ifdef HAVE_TI_MODE
-# if defined(__SIZEOF_INT128__)
+#if defined(__SIZEOF_INT128__)
 typedef unsigned __int128 uint128_t;
-# else
+#else
 typedef unsigned uint128_t __attribute__((mode(TI)));
-# endif
+#endif
 #endif

 #define ROTL32(X, B) rotl32((X), (B))
 static inline uint32_t
 rotl32(const uint32_t x, const int b)
 {
-    return (x << b) | (x >> (32 - b));
+  return (x << b) | (x >> (32 - b));
 }

 #define ROTL64(X, B) rotl64((X), (B))
 static inline uint64_t
 rotl64(const uint64_t x, const int b)
 {
-    return (x << b) | (x >> (64 - b));
+  return (x << b) | (x >> (64 - b));
 }

 #define ROTR32(X, B) rotr32((X), (B))
 static inline uint32_t
 rotr32(const uint32_t x, const int b)
 {
-    return (x >> b) | (x << (32 - b));
+  return (x >> b) | (x << (32 - b));
 }

 #define ROTR64(X, B) rotr64((X), (B))
 static inline uint64_t
 rotr64(const uint64_t x, const int b)
 {
-    return (x >> b) | (x << (64 - b));
+  return (x >> b) | (x << (64 - b));
 }

 #define LOAD64_LE(SRC) load64_le(SRC)
@@ -48,19 +48,19 @@ static inline uint64_t
 load64_le(const uint8_t src[8])
 {
 #ifdef NATIVE_LITTLE_ENDIAN
-    uint64_t w;
-    memcpy(&w, src, sizeof w);
-    return w;
+  uint64_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
 #else
-    uint64_t w = (uint64_t) src[0];
-    w |= (uint64_t) src[1] <<  8;
-    w |= (uint64_t) src[2] << 16;
-    w |= (uint64_t) src[3] << 24;
-    w |= (uint64_t) src[4] << 32;
-    w |= (uint64_t) src[5] << 40;
-    w |= (uint64_t) src[6] << 48;
-    w |= (uint64_t) src[7] << 56;
-    return w;
+  uint64_t w = (uint64_t)src[0];
+  w |= (uint64_t)src[1] << 8;
+  w |= (uint64_t)src[2] << 16;
+  w |= (uint64_t)src[3] << 24;
+  w |= (uint64_t)src[4] << 32;
+  w |= (uint64_t)src[5] << 40;
+  w |= (uint64_t)src[6] << 48;
+  w |= (uint64_t)src[7] << 56;
+  return w;
 #endif
 }

@@ -69,16 +69,23 @@ static inline void
 store64_le(uint8_t dst[8], uint64_t w)
 {
 #ifdef NATIVE_LITTLE_ENDIAN
-    memcpy(dst, &w, sizeof w);
+  memcpy(dst, &w, sizeof w);
 #else
-    dst[0] = (uint8_t) w; w >>= 8;
-    dst[1] = (uint8_t) w; w >>= 8;
-    dst[2] = (uint8_t) w; w >>= 8;
-    dst[3] = (uint8_t) w; w >>= 8;
-    dst[4] = (uint8_t) w; w >>= 8;
-    dst[5] = (uint8_t) w; w >>= 8;
-    dst[6] = (uint8_t) w; w >>= 8;
-    dst[7] = (uint8_t) w;
+  dst[0] = (uint8_t)w;
+  w >>= 8;
+  dst[1] = (uint8_t)w;
+  w >>= 8;
+  dst[2] = (uint8_t)w;
+  w >>= 8;
+  dst[3] = (uint8_t)w;
+  w >>= 8;
+  dst[4] = (uint8_t)w;
+  w >>= 8;
+  dst[5] = (uint8_t)w;
+  w >>= 8;
+  dst[6] = (uint8_t)w;
+  w >>= 8;
+  dst[7]     = (uint8_t)w;
 #endif
 }

@@ -87,15 +94,15 @@ static inline uint32_t
 load32_le(const uint8_t src[4])
 {
 #ifdef NATIVE_LITTLE_ENDIAN
-    uint32_t w;
-    memcpy(&w, src, sizeof w);
-    return w;
+  uint32_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
 #else
-    uint32_t w = (uint32_t) src[0];
-    w |= (uint32_t) src[1] <<  8;
-    w |= (uint32_t) src[2] << 16;
-    w |= (uint32_t) src[3] << 24;
-    return w;
+  uint32_t w = (uint32_t)src[0];
+  w |= (uint32_t)src[1] << 8;
+  w |= (uint32_t)src[2] << 16;
+  w |= (uint32_t)src[3] << 24;
+  return w;
 #endif
 }

@@ -104,12 +111,15 @@ static inline void
 store32_le(uint8_t dst[4], uint32_t w)
 {
 #ifdef NATIVE_LITTLE_ENDIAN
-    memcpy(dst, &w, sizeof w);
+  memcpy(dst, &w, sizeof w);
 #else
-    dst[0] = (uint8_t) w; w >>= 8;
-    dst[1] = (uint8_t) w; w >>= 8;
-    dst[2] = (uint8_t) w; w >>= 8;
-    dst[3] = (uint8_t) w;
+  dst[0] = (uint8_t)w;
+  w >>= 8;
+  dst[1] = (uint8_t)w;
+  w >>= 8;
+  dst[2] = (uint8_t)w;
+  w >>= 8;
+  dst[3]     = (uint8_t)w;
 #endif
 }

@@ -120,19 +130,19 @@ static inline uint64_t
 load64_be(const uint8_t src[8])
 {
 #ifdef NATIVE_BIG_ENDIAN
-    uint64_t w;
-    memcpy(&w, src, sizeof w);
-    return w;
+  uint64_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
 #else
-    uint64_t w = (uint64_t) src[7];
-    w |= (uint64_t) src[6] <<  8;
-    w |= (uint64_t) src[5] << 16;
-    w |= (uint64_t) src[4] << 24;
-    w |= (uint64_t) src[3] << 32;
-    w |= (uint64_t) src[2] << 40;
-    w |= (uint64_t) src[1] << 48;
-    w |= (uint64_t) src[0] << 56;
-    return w;
+  uint64_t w = (uint64_t)src[7];
+  w |= (uint64_t)src[6] << 8;
+  w |= (uint64_t)src[5] << 16;
+  w |= (uint64_t)src[4] << 24;
+  w |= (uint64_t)src[3] << 32;
+  w |= (uint64_t)src[2] << 40;
+  w |= (uint64_t)src[1] << 48;
+  w |= (uint64_t)src[0] << 56;
+  return w;
 #endif
 }

@@ -141,16 +151,23 @@ static inline void
 store64_be(uint8_t dst[8], uint64_t w)
 {
 #ifdef NATIVE_BIG_ENDIAN
-    memcpy(dst, &w, sizeof w);
+  memcpy(dst, &w, sizeof w);
 #else
-    dst[7] = (uint8_t) w; w >>= 8;
-    dst[6] = (uint8_t) w; w >>= 8;
-    dst[5] = (uint8_t) w; w >>= 8;
-    dst[4] = (uint8_t) w; w >>= 8;
-    dst[3] = (uint8_t) w; w >>= 8;
-    dst[2] = (uint8_t) w; w >>= 8;
-    dst[1] = (uint8_t) w; w >>= 8;
-    dst[0] = (uint8_t) w;
+  dst[7] = (uint8_t)w;
+  w >>= 8;
+  dst[6] = (uint8_t)w;
+  w >>= 8;
+  dst[5] = (uint8_t)w;
+  w >>= 8;
+  dst[4] = (uint8_t)w;
+  w >>= 8;
+  dst[3] = (uint8_t)w;
+  w >>= 8;
+  dst[2] = (uint8_t)w;
+  w >>= 8;
+  dst[1] = (uint8_t)w;
+  w >>= 8;
+  dst[0]     = (uint8_t)w;
 #endif
 }

@@ -159,15 +176,15 @@ static inline uint32_t
 load32_be(const uint8_t src[4])
 {
 #ifdef NATIVE_BIG_ENDIAN
-    uint32_t w;
-    memcpy(&w, src, sizeof w);
-    return w;
+  uint32_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
 #else
-    uint32_t w = (uint32_t) src[3];
-    w |= (uint32_t) src[2] <<  8;
-    w |= (uint32_t) src[1] << 16;
-    w |= (uint32_t) src[0] << 24;
-    return w;
+  uint32_t w = (uint32_t)src[3];
+  w |= (uint32_t)src[2] << 8;
+  w |= (uint32_t)src[1] << 16;
+  w |= (uint32_t)src[0] << 24;
+  return w;
 #endif
 }

@@ -176,12 +193,15 @@ static inline void
 store32_be(uint8_t dst[4], uint32_t w)
 {
 #ifdef NATIVE_BIG_ENDIAN
-    memcpy(dst, &w, sizeof w);
+  memcpy(dst, &w, sizeof w);
 #else
-    dst[3] = (uint8_t) w; w >>= 8;
-    dst[2] = (uint8_t) w; w >>= 8;
-    dst[1] = (uint8_t) w; w >>= 8;
-    dst[0] = (uint8_t) w;
+  dst[3] = (uint8_t)w;
+  w >>= 8;
+  dst[2] = (uint8_t)w;
+  w >>= 8;
+  dst[1] = (uint8_t)w;
+  w >>= 8;
+  dst[0] = (uint8_t)w;
 #endif
 }

@@ -189,58 +209,61 @@ store32_be(uint8_t dst[4], uint32_t w)
 static inline void
 xor_buf(unsigned char *out, const unsigned char *in, size_t n)
 {
-    size_t i;
+  size_t i;

-    for (i = 0; i < n; i++) {
-        out[i] ^= in[i];
-    }
+  for(i = 0; i < n; i++)
+  {
+    out[i] ^= in[i];
+  }
 }

 #if !defined(__clang__) && !defined(__GNUC__)
-# ifdef __attribute__
-#  undef __attribute__
-# endif
-# define __attribute__(a)
+#ifdef __attribute__
+#undef __attribute__
+#endif
+#define __attribute__(a)
 #endif

 #ifndef CRYPTO_ALIGN
-# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
-#  define CRYPTO_ALIGN(x) __declspec(align(x))
-# else
-#  define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
-# endif
-#endif
-
-#if defined(_MSC_VER) && \
-    (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
-
-# include <intrin.h>
-
-# define HAVE_INTRIN_H    1
-# define HAVE_MMINTRIN_H  1
-# define HAVE_EMMINTRIN_H 1
-# define HAVE_PMMINTRIN_H 1
-# define HAVE_TMMINTRIN_H 1
-# define HAVE_SMMINTRIN_H 1
-# define HAVE_AVXINTRIN_H 1
-# if _MSC_VER >= 1600
-#  define HAVE_WMMINTRIN_H 1
-# endif
-# if _MSC_VER >= 1700 && defined(_M_X64)
-#  define HAVE_AVX2INTRIN_H 1
-# endif
+#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
+#define CRYPTO_ALIGN(x) __declspec(align(x))
+#else
+#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
+#endif
+#endif
+
+#if defined(_MSC_VER) \
+    && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
+
+#include <intrin.h>
+
+#define HAVE_INTRIN_H 1
+#define HAVE_MMINTRIN_H 1
+#define HAVE_EMMINTRIN_H 1
+#define HAVE_PMMINTRIN_H 1
+#define HAVE_TMMINTRIN_H 1
+#define HAVE_SMMINTRIN_H 1
+#define HAVE_AVXINTRIN_H 1
+#if _MSC_VER >= 1600
+#define HAVE_WMMINTRIN_H 1
+#endif
+#if _MSC_VER >= 1700 && defined(_M_X64)
+#define HAVE_AVX2INTRIN_H 1
+#endif
 #elif defined(HAVE_INTRIN_H)
-# include <intrin.h>
+#include <intrin.h>
 #endif

 #ifdef HAVE_LIBCTGRIND
-extern void ct_poison  (const void *, size_t);
-extern void ct_unpoison(const void *, size_t);
-# define POISON(X, L)   ct_poison((X), (L))
-# define UNPOISON(X, L) ct_unpoison((X), (L))
+extern void
+ct_poison(const void *, size_t);
+extern void
+ct_unpoison(const void *, size_t);
+#define POISON(X, L) ct_poison((X), (L))
+#define UNPOISON(X, L) ct_unpoison((X), (L))
 #else
-# define POISON(X, L)   (void) 0
-# define UNPOISON(X, L) (void) 0
+#define POISON(X, L) (void)0
+#define UNPOISON(X, L) (void)0
 #endif

 #endif
--- a/crypto/include/sodium/crypto_box.h
+++ b/crypto/include/sodium/crypto_box.h
@@ -14,157 +14,189 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

 #define crypto_box_SEEDBYTES crypto_box_curve25519xsalsa20poly1305_SEEDBYTES
-SODIUM_EXPORT
-size_t  crypto_box_seedbytes(void);
-
-#define crypto_box_PUBLICKEYBYTES crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES
-SODIUM_EXPORT
-size_t  crypto_box_publickeybytes(void);
-
-#define crypto_box_SECRETKEYBYTES crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES
-SODIUM_EXPORT
-size_t  crypto_box_secretkeybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_seedbytes(void);
+
+#define crypto_box_PUBLICKEYBYTES \
+  crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES
+  SODIUM_EXPORT
+  size_t
+  crypto_box_publickeybytes(void);
+
+#define crypto_box_SECRETKEYBYTES \
+  crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES
+  SODIUM_EXPORT
+  size_t
+  crypto_box_secretkeybytes(void);

 #define crypto_box_NONCEBYTES crypto_box_curve25519xsalsa20poly1305_NONCEBYTES
-SODIUM_EXPORT
-size_t  crypto_box_noncebytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_noncebytes(void);

 #define crypto_box_MACBYTES crypto_box_curve25519xsalsa20poly1305_MACBYTES
-SODIUM_EXPORT
-size_t  crypto_box_macbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_macbytes(void);

-#define crypto_box_MESSAGEBYTES_MAX crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX
-SODIUM_EXPORT
-size_t  crypto_box_messagebytes_max(void);
+#define crypto_box_MESSAGEBYTES_MAX \
+  crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX
+  SODIUM_EXPORT
+  size_t
+  crypto_box_messagebytes_max(void);

 #define crypto_box_PRIMITIVE "curve25519xsalsa20poly1305"
-SODIUM_EXPORT
-const char *crypto_box_primitive(void);
-
-SODIUM_EXPORT
-int crypto_box_seed_keypair(unsigned char *pk, unsigned char *sk,
-                            const unsigned char *seed);
-
-SODIUM_EXPORT
-int crypto_box_keypair(unsigned char *pk, unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_box_easy(unsigned char *c, const unsigned char *m,
-                    unsigned long long mlen, const unsigned char *n,
-                    const unsigned char *pk, const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_open_easy(unsigned char *m, const unsigned char *c,
-                         unsigned long long clen, const unsigned char *n,
-                         const unsigned char *pk, const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_detached(unsigned char *c, unsigned char *mac,
-                        const unsigned char *m, unsigned long long mlen,
-                        const unsigned char *n, const unsigned char *pk,
-                        const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_open_detached(unsigned char *m, const unsigned char *c,
-                             const unsigned char *mac,
-                             unsigned long long clen,
-                             const unsigned char *n,
-                             const unsigned char *pk,
-                             const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-/* -- Precomputation interface -- */
-
-#define crypto_box_BEFORENMBYTES crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES
-SODIUM_EXPORT
-size_t  crypto_box_beforenmbytes(void);
-
-SODIUM_EXPORT
-int crypto_box_beforenm(unsigned char *k, const unsigned char *pk,
-                        const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_easy_afternm(unsigned char *c, const unsigned char *m,
-                            unsigned long long mlen, const unsigned char *n,
-                            const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_box_open_easy_afternm(unsigned char *m, const unsigned char *c,
-                                 unsigned long long clen, const unsigned char *n,
-                                 const unsigned char *k)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_detached_afternm(unsigned char *c, unsigned char *mac,
-                                const unsigned char *m, unsigned long long mlen,
-                                const unsigned char *n, const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_box_open_detached_afternm(unsigned char *m, const unsigned char *c,
-                                     const unsigned char *mac,
-                                     unsigned long long clen, const unsigned char *n,
-                                     const unsigned char *k)
-            __attribute__ ((warn_unused_result));
-
-/* -- Ephemeral SK interface -- */
+  SODIUM_EXPORT
+  const char *
+  crypto_box_primitive(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_seed_keypair(unsigned char *pk, unsigned char *sk,
+                          const unsigned char *seed);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_keypair(unsigned char *pk, unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_easy(unsigned char *c, const unsigned char *m,
+                  unsigned long long mlen, const unsigned char *n,
+                  const unsigned char *pk, const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_open_easy(unsigned char *m, const unsigned char *c,
+                       unsigned long long clen, const unsigned char *n,
+                       const unsigned char *pk, const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_detached(unsigned char *c, unsigned char *mac,
+                      const unsigned char *m, unsigned long long mlen,
+                      const unsigned char *n, const unsigned char *pk,
+                      const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_open_detached(unsigned char *m, const unsigned char *c,
+                           const unsigned char *mac, unsigned long long clen,
+                           const unsigned char *n, const unsigned char *pk,
+                           const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  /* -- Precomputation interface -- */
+
+#define crypto_box_BEFORENMBYTES \
+  crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES
+  SODIUM_EXPORT
+  size_t
+  crypto_box_beforenmbytes(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_beforenm(unsigned char *k, const unsigned char *pk,
+                      const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_easy_afternm(unsigned char *c, const unsigned char *m,
+                          unsigned long long mlen, const unsigned char *n,
+                          const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_open_easy_afternm(unsigned char *m, const unsigned char *c,
+                               unsigned long long clen, const unsigned char *n,
+                               const unsigned char *k)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_detached_afternm(unsigned char *c, unsigned char *mac,
+                              const unsigned char *m, unsigned long long mlen,
+                              const unsigned char *n, const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_open_detached_afternm(unsigned char *m, const unsigned char *c,
+                                   const unsigned char *mac,
+                                   unsigned long long clen,
+                                   const unsigned char *n,
+                                   const unsigned char *k)
+      __attribute__((warn_unused_result));
+
+  /* -- Ephemeral SK interface -- */

 #define crypto_box_SEALBYTES (crypto_box_PUBLICKEYBYTES + crypto_box_MACBYTES)
-SODIUM_EXPORT
-size_t crypto_box_sealbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_sealbytes(void);

-SODIUM_EXPORT
-int crypto_box_seal(unsigned char *c, const unsigned char *m,
-                    unsigned long long mlen, const unsigned char *pk);
+  SODIUM_EXPORT
+  int
+  crypto_box_seal(unsigned char *c, const unsigned char *m,
+                  unsigned long long mlen, const unsigned char *pk);

-SODIUM_EXPORT
-int crypto_box_seal_open(unsigned char *m, const unsigned char *c,
-                         unsigned long long clen,
-                         const unsigned char *pk, const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
+  SODIUM_EXPORT
+  int
+  crypto_box_seal_open(unsigned char *m, const unsigned char *c,
+                       unsigned long long clen, const unsigned char *pk,
+                       const unsigned char *sk)
+      __attribute__((warn_unused_result));

-/* -- NaCl compatibility interface ; Requires padding -- */
+  /* -- NaCl compatibility interface ; Requires padding -- */

 #define crypto_box_ZEROBYTES crypto_box_curve25519xsalsa20poly1305_ZEROBYTES
-SODIUM_EXPORT
-size_t  crypto_box_zerobytes(void);
-
-#define crypto_box_BOXZEROBYTES crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES
-SODIUM_EXPORT
-size_t  crypto_box_boxzerobytes(void);
-
-SODIUM_EXPORT
-int crypto_box(unsigned char *c, const unsigned char *m,
-               unsigned long long mlen, const unsigned char *n,
-               const unsigned char *pk, const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_open(unsigned char *m, const unsigned char *c,
-                    unsigned long long clen, const unsigned char *n,
-                    const unsigned char *pk, const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_afternm(unsigned char *c, const unsigned char *m,
-                       unsigned long long mlen, const unsigned char *n,
-                       const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_box_open_afternm(unsigned char *m, const unsigned char *c,
-                            unsigned long long clen, const unsigned char *n,
-                            const unsigned char *k)
-            __attribute__ ((warn_unused_result));
+  SODIUM_EXPORT
+  size_t
+  crypto_box_zerobytes(void);
+
+#define crypto_box_BOXZEROBYTES \
+  crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES
+  SODIUM_EXPORT
+  size_t
+  crypto_box_boxzerobytes(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_box(unsigned char *c, const unsigned char *m, unsigned long long mlen,
+             const unsigned char *n, const unsigned char *pk,
+             const unsigned char *sk) __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_open(unsigned char *m, const unsigned char *c,
+                  unsigned long long clen, const unsigned char *n,
+                  const unsigned char *pk, const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_afternm(unsigned char *c, const unsigned char *m,
+                     unsigned long long mlen, const unsigned char *n,
+                     const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_open_afternm(unsigned char *m, const unsigned char *c,
+                          unsigned long long clen, const unsigned char *n,
+                          const unsigned char *k)
+      __attribute__((warn_unused_result));

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_box_curve25519xsalsa20poly1305.h
+++ b/crypto/include/sodium/crypto_box_curve25519xsalsa20poly1305.h
@@ -6,101 +6,114 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

 #define crypto_box_curve25519xsalsa20poly1305_SEEDBYTES 32U
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_seedbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_seedbytes(void);

 #define crypto_box_curve25519xsalsa20poly1305_PUBLICKEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_publickeybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_publickeybytes(void);

 #define crypto_box_curve25519xsalsa20poly1305_SECRETKEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_secretkeybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_secretkeybytes(void);

 #define crypto_box_curve25519xsalsa20poly1305_BEFORENMBYTES 32U
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_beforenmbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_beforenmbytes(void);

 #define crypto_box_curve25519xsalsa20poly1305_NONCEBYTES 24U
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_noncebytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_noncebytes(void);

 #define crypto_box_curve25519xsalsa20poly1305_MACBYTES 16U
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_macbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_macbytes(void);

-/* Only for the libsodium API - The NaCl compatibility API would require BOXZEROBYTES extra bytes */
+/* Only for the libsodium API - The NaCl compatibility API would require
+ * BOXZEROBYTES extra bytes */
 #define crypto_box_curve25519xsalsa20poly1305_MESSAGEBYTES_MAX \
-    (crypto_stream_xsalsa20_MESSAGEBYTES_MAX - crypto_box_curve25519xsalsa20poly1305_MACBYTES)
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_messagebytes_max(void);
-
-SODIUM_EXPORT
-int crypto_box_curve25519xsalsa20poly1305_seed_keypair(unsigned char *pk,
-                                                       unsigned char *sk,
-                                                       const unsigned char *seed);
-
-SODIUM_EXPORT
-int crypto_box_curve25519xsalsa20poly1305_keypair(unsigned char *pk,
-                                                  unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_box_curve25519xsalsa20poly1305_beforenm(unsigned char *k,
-                                                   const unsigned char *pk,
-                                                   const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-/* -- NaCl compatibility interface ; Requires padding -- */
+  (crypto_stream_xsalsa20_MESSAGEBYTES_MAX                     \
+   - crypto_box_curve25519xsalsa20poly1305_MACBYTES)
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_messagebytes_max(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_curve25519xsalsa20poly1305_seed_keypair(unsigned char *pk,
+                                                     unsigned char *sk,
+                                                     const unsigned char *seed);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_curve25519xsalsa20poly1305_keypair(unsigned char *pk,
+                                                unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_curve25519xsalsa20poly1305_beforenm(unsigned char *k,
+                                                 const unsigned char *pk,
+                                                 const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  /* -- NaCl compatibility interface ; Requires padding -- */

 #define crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES 16U
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_boxzerobytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_boxzerobytes(void);

 #define crypto_box_curve25519xsalsa20poly1305_ZEROBYTES \
-    (crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES + \
-     crypto_box_curve25519xsalsa20poly1305_MACBYTES)
-SODIUM_EXPORT
-size_t crypto_box_curve25519xsalsa20poly1305_zerobytes(void);
-
-SODIUM_EXPORT
-int crypto_box_curve25519xsalsa20poly1305(unsigned char *c,
-                                          const unsigned char *m,
-                                          unsigned long long mlen,
-                                          const unsigned char *n,
-                                          const unsigned char *pk,
-                                          const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_curve25519xsalsa20poly1305_open(unsigned char *m,
-                                               const unsigned char *c,
-                                               unsigned long long clen,
-                                               const unsigned char *n,
-                                               const unsigned char *pk,
-                                               const unsigned char *sk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_box_curve25519xsalsa20poly1305_afternm(unsigned char *c,
-                                                  const unsigned char *m,
-                                                  unsigned long long mlen,
-                                                  const unsigned char *n,
-                                                  const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_box_curve25519xsalsa20poly1305_open_afternm(unsigned char *m,
-                                                       const unsigned char *c,
-                                                       unsigned long long clen,
-                                                       const unsigned char *n,
-                                                       const unsigned char *k)
-            __attribute__ ((warn_unused_result));
+  (crypto_box_curve25519xsalsa20poly1305_BOXZEROBYTES   \
+   + crypto_box_curve25519xsalsa20poly1305_MACBYTES)
+  SODIUM_EXPORT
+  size_t
+  crypto_box_curve25519xsalsa20poly1305_zerobytes(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_curve25519xsalsa20poly1305(
+      unsigned char *c, const unsigned char *m, unsigned long long mlen,
+      const unsigned char *n, const unsigned char *pk, const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_curve25519xsalsa20poly1305_open(
+      unsigned char *m, const unsigned char *c, unsigned long long clen,
+      const unsigned char *n, const unsigned char *pk, const unsigned char *sk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_box_curve25519xsalsa20poly1305_afternm(unsigned char *c,
+                                                const unsigned char *m,
+                                                unsigned long long mlen,
+                                                const unsigned char *n,
+                                                const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_box_curve25519xsalsa20poly1305_open_afternm(unsigned char *m,
+                                                     const unsigned char *c,
+                                                     unsigned long long clen,
+                                                     const unsigned char *n,
+                                                     const unsigned char *k)
+      __attribute__((warn_unused_result));

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_core_hchacha20.h
+++ b/crypto/include/sodium/crypto_core_hchacha20.h
@@ -5,28 +5,34 @@
 #include "export.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #define crypto_core_hchacha20_OUTPUTBYTES 32U
-SODIUM_EXPORT
-size_t crypto_core_hchacha20_outputbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_core_hchacha20_outputbytes(void);

 #define crypto_core_hchacha20_INPUTBYTES 16U
-SODIUM_EXPORT
-size_t crypto_core_hchacha20_inputbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_core_hchacha20_inputbytes(void);

 #define crypto_core_hchacha20_KEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_core_hchacha20_keybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_core_hchacha20_keybytes(void);

 #define crypto_core_hchacha20_CONSTBYTES 16U
-SODIUM_EXPORT
-size_t crypto_core_hchacha20_constbytes(void);
-
-SODIUM_EXPORT
-int crypto_core_hchacha20(unsigned char *out, const unsigned char *in,
-                          const unsigned char *k, const unsigned char *c);
+  SODIUM_EXPORT
+  size_t
+  crypto_core_hchacha20_constbytes(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_core_hchacha20(unsigned char *out, const unsigned char *in,
+                        const unsigned char *k, const unsigned char *c);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_core_salsa20.h
+++ b/crypto/include/sodium/crypto_core_salsa20.h
@@ -5,28 +5,34 @@
 #include "export.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #define crypto_core_salsa20_OUTPUTBYTES 64U
-SODIUM_EXPORT
-size_t crypto_core_salsa20_outputbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_core_salsa20_outputbytes(void);

 #define crypto_core_salsa20_INPUTBYTES 16U
-SODIUM_EXPORT
-size_t crypto_core_salsa20_inputbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_core_salsa20_inputbytes(void);

 #define crypto_core_salsa20_KEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_core_salsa20_keybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_core_salsa20_keybytes(void);

 #define crypto_core_salsa20_CONSTBYTES 16U
-SODIUM_EXPORT
-size_t crypto_core_salsa20_constbytes(void);
-
-SODIUM_EXPORT
-int crypto_core_salsa20(unsigned char *out, const unsigned char *in,
-                        const unsigned char *k, const unsigned char *c);
+  SODIUM_EXPORT
+  size_t
+  crypto_core_salsa20_constbytes(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_core_salsa20(unsigned char *out, const unsigned char *in,
+                      const unsigned char *k, const unsigned char *c);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_generichash.h
+++ b/crypto/include/sodium/crypto_generichash.h
@ -7,66 +7,79 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

 #define crypto_generichash_BYTES_MIN crypto_generichash_blake2b_BYTES_MIN
-SODIUM_EXPORT
-size_t  crypto_generichash_bytes_min(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_bytes_min(void);

 #define crypto_generichash_BYTES_MAX crypto_generichash_blake2b_BYTES_MAX
-SODIUM_EXPORT
-size_t  crypto_generichash_bytes_max(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_bytes_max(void);

 #define crypto_generichash_BYTES crypto_generichash_blake2b_BYTES
-SODIUM_EXPORT
-size_t  crypto_generichash_bytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_bytes(void);

 #define crypto_generichash_KEYBYTES_MIN crypto_generichash_blake2b_KEYBYTES_MIN
-SODIUM_EXPORT
-size_t  crypto_generichash_keybytes_min(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_keybytes_min(void);

 #define crypto_generichash_KEYBYTES_MAX crypto_generichash_blake2b_KEYBYTES_MAX
-SODIUM_EXPORT
-size_t  crypto_generichash_keybytes_max(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_keybytes_max(void);

 #define crypto_generichash_KEYBYTES crypto_generichash_blake2b_KEYBYTES
-SODIUM_EXPORT
-size_t  crypto_generichash_keybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_keybytes(void);

 #define crypto_generichash_PRIMITIVE "blake2b"
-SODIUM_EXPORT
-const char *crypto_generichash_primitive(void);
-
-typedef crypto_generichash_blake2b_state crypto_generichash_state;
-
-SODIUM_EXPORT
-size_t  crypto_generichash_statebytes(void);
-
-SODIUM_EXPORT
-int crypto_generichash(unsigned char *out, size_t outlen,
-                       const unsigned char *in, unsigned long long inlen,
-                       const unsigned char *key, size_t keylen);
-
-SODIUM_EXPORT
-int crypto_generichash_init(crypto_generichash_state *state,
-                            const unsigned char *key,
-                            const size_t keylen, const size_t outlen);
-
-SODIUM_EXPORT
-int crypto_generichash_update(crypto_generichash_state *state,
-                              const unsigned char *in,
-                              unsigned long long inlen);
-
-SODIUM_EXPORT
-int crypto_generichash_final(crypto_generichash_state *state,
-                             unsigned char *out, const size_t outlen);
-
-SODIUM_EXPORT
-void crypto_generichash_keygen(unsigned char k[crypto_generichash_KEYBYTES]);
+  SODIUM_EXPORT
+  const char *
+  crypto_generichash_primitive(void);
+
+  typedef crypto_generichash_blake2b_state crypto_generichash_state;
+
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_statebytes(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash(unsigned char *out, size_t outlen, const unsigned char *in,
+                     unsigned long long inlen, const unsigned char *key,
+                     size_t keylen);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_init(crypto_generichash_state *state,
+                          const unsigned char *key, const size_t keylen,
+                          const size_t outlen);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_update(crypto_generichash_state *state,
+                            const unsigned char *in, unsigned long long inlen);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_final(crypto_generichash_state *state, unsigned char *out,
+                           const size_t outlen);
+
+  SODIUM_EXPORT
+  void
+  crypto_generichash_keygen(unsigned char k[crypto_generichash_KEYBYTES]);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_generichash_blake2b.h
+++ b/crypto/include/sodium/crypto_generichash_blake2b.h
@ -8,107 +8,120 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

 #if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
-# pragma pack(1)
+#pragma pack(1)
 #else
-# pragma pack(push, 1)
+#pragma pack(push, 1)
 #endif

-typedef struct CRYPTO_ALIGN(64) crypto_generichash_blake2b_state {
+  typedef struct CRYPTO_ALIGN(64) crypto_generichash_blake2b_state
+  {
    uint64_t h[8];
    uint64_t t[2];
    uint64_t f[2];
-    uint8_t  buf[2 * 128];
-    size_t   buflen;
-    uint8_t  last_node;
-} crypto_generichash_blake2b_state;
+    uint8_t buf[2 * 128];
+    size_t buflen;
+    uint8_t last_node;
+  } crypto_generichash_blake2b_state;

 #if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
-# pragma pack()
+#pragma pack()
 #else
-# pragma pack(pop)
+#pragma pack(pop)
 #endif

-#define crypto_generichash_blake2b_BYTES_MIN     16U
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_bytes_min(void);
-
-#define crypto_generichash_blake2b_BYTES_MAX     64U
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_bytes_max(void);
-
-#define crypto_generichash_blake2b_BYTES         32U
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_bytes(void);
-
-#define crypto_generichash_blake2b_KEYBYTES_MIN  16U
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_keybytes_min(void);
-
-#define crypto_generichash_blake2b_KEYBYTES_MAX  64U
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_keybytes_max(void);
-
-#define crypto_generichash_blake2b_KEYBYTES      32U
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_keybytes(void);
-
-#define crypto_generichash_blake2b_SALTBYTES     16U
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_saltbytes(void);
+#define crypto_generichash_blake2b_BYTES_MIN 16U
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_bytes_min(void);
+
+#define crypto_generichash_blake2b_BYTES_MAX 64U
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_bytes_max(void);
+
+#define crypto_generichash_blake2b_BYTES 32U
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_bytes(void);
+
+#define crypto_generichash_blake2b_KEYBYTES_MIN 16U
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_keybytes_min(void);
+
+#define crypto_generichash_blake2b_KEYBYTES_MAX 64U
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_keybytes_max(void);
+
+#define crypto_generichash_blake2b_KEYBYTES 32U
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_keybytes(void);
+
+#define crypto_generichash_blake2b_SALTBYTES 16U
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_saltbytes(void);

 #define crypto_generichash_blake2b_PERSONALBYTES 16U
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_personalbytes(void);
-
-SODIUM_EXPORT
-size_t crypto_generichash_blake2b_statebytes(void);
-
-SODIUM_EXPORT
-int crypto_generichash_blake2b(unsigned char *out, size_t outlen,
-                               const unsigned char *in,
-                               unsigned long long inlen,
-                               const unsigned char *key, size_t keylen);
-
-SODIUM_EXPORT
-int crypto_generichash_blake2b_salt_personal(unsigned char *out, size_t outlen,
-                                             const unsigned char *in,
-                                             unsigned long long inlen,
-                                             const unsigned char *key,
-                                             size_t keylen,
-                                             const unsigned char *salt,
-                                             const unsigned char *personal);
-
-SODIUM_EXPORT
-int crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state,
-                                    const unsigned char *key,
-                                    const size_t keylen, const size_t outlen);
-
-SODIUM_EXPORT
-int crypto_generichash_blake2b_init_salt_personal(crypto_generichash_blake2b_state *state,
-                                                  const unsigned char *key,
-                                                  const size_t keylen, const size_t outlen,
-                                                  const unsigned char *salt,
-                                                  const unsigned char *personal);
-
-SODIUM_EXPORT
-int crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state,
-                                      const unsigned char *in,
-                                      unsigned long long inlen);
-
-SODIUM_EXPORT
-int crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state,
-                                     unsigned char *out,
-                                     const size_t outlen);
-
-SODIUM_EXPORT
-void crypto_generichash_blake2b_keygen(unsigned char k[crypto_generichash_blake2b_KEYBYTES]);
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_personalbytes(void);
+
+  SODIUM_EXPORT
+  size_t
+  crypto_generichash_blake2b_statebytes(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_blake2b(unsigned char *out, size_t outlen,
+                             const unsigned char *in, unsigned long long inlen,
+                             const unsigned char *key, size_t keylen);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_blake2b_salt_personal(
+      unsigned char *out, size_t outlen, const unsigned char *in,
+      unsigned long long inlen, const unsigned char *key, size_t keylen,
+      const unsigned char *salt, const unsigned char *personal);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state,
+                                  const unsigned char *key, const size_t keylen,
+                                  const size_t outlen);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_blake2b_init_salt_personal(
+      crypto_generichash_blake2b_state *state, const unsigned char *key,
+      const size_t keylen, const size_t outlen, const unsigned char *salt,
+      const unsigned char *personal);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state,
+                                    const unsigned char *in,
+                                    unsigned long long inlen);
+
+  SODIUM_EXPORT
+  int
+  crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state,
+                                   unsigned char *out, const size_t outlen);
+
+  SODIUM_EXPORT
+  void
+  crypto_generichash_blake2b_keygen(
+      unsigned char k[crypto_generichash_blake2b_KEYBYTES]);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_scalarmult.h
+++ b/crypto/include/sodium/crypto_scalarmult.h
@ -7,36 +7,41 @@
 #include "export.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #define crypto_scalarmult_BYTES crypto_scalarmult_curve25519_BYTES
-SODIUM_EXPORT
-size_t  crypto_scalarmult_bytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_scalarmult_bytes(void);

 #define crypto_scalarmult_SCALARBYTES crypto_scalarmult_curve25519_SCALARBYTES
-SODIUM_EXPORT
-size_t  crypto_scalarmult_scalarbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_scalarmult_scalarbytes(void);

 #define crypto_scalarmult_PRIMITIVE "curve25519"
-SODIUM_EXPORT
-const char *crypto_scalarmult_primitive(void);
-
-SODIUM_EXPORT
-int crypto_scalarmult_base(unsigned char *q, const unsigned char *n);
-
-/*
- * NOTE: Do not use the result of this function directly.
- *
- * Hash the result with the public keys in order to compute a shared
- * secret key: H(q || client_pk || server_pk)
- *
- * Or unless this is not an option, use the crypto_kx() API instead.
- */
-SODIUM_EXPORT
-int crypto_scalarmult(unsigned char *q, const unsigned char *n,
-                      const unsigned char *p)
-            __attribute__ ((warn_unused_result));
+  SODIUM_EXPORT
+  const char *
+  crypto_scalarmult_primitive(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_scalarmult_base(unsigned char *q, const unsigned char *n);
+
+  /*
+   * NOTE: Do not use the result of this function directly.
+   *
+   * Hash the result with the public keys in order to compute a shared
+   * secret key: H(q || client_pk || server_pk)
+   *
+   * Or unless this is not an option, use the crypto_kx() API instead.
+   */
+  SODIUM_EXPORT
+  int
+  crypto_scalarmult(unsigned char *q, const unsigned char *n,
+                    const unsigned char *p) __attribute__((warn_unused_result));

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_scalarmult_curve25519.h
+++ b/crypto/include/sodium/crypto_scalarmult_curve25519.h
@ -6,32 +6,37 @@
 #include "export.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #define crypto_scalarmult_curve25519_BYTES 32U
-SODIUM_EXPORT
-size_t crypto_scalarmult_curve25519_bytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_scalarmult_curve25519_bytes(void);

 #define crypto_scalarmult_curve25519_SCALARBYTES 32U
-SODIUM_EXPORT
-size_t crypto_scalarmult_curve25519_scalarbytes(void);
-
-/*
- * NOTE: Do not use the result of this function directly.
- *
- * Hash the result with the public keys in order to compute a shared
- * secret key: H(q || client_pk || server_pk)
- *
- * Or unless this is not an option, use the crypto_kx() API instead.
- */
-SODIUM_EXPORT
-int crypto_scalarmult_curve25519(unsigned char *q, const unsigned char *n,
-                                 const unsigned char *p)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_scalarmult_curve25519_base(unsigned char *q, const unsigned char *n);
+  SODIUM_EXPORT
+  size_t
+  crypto_scalarmult_curve25519_scalarbytes(void);
+
+  /*
+   * NOTE: Do not use the result of this function directly.
+   *
+   * Hash the result with the public keys in order to compute a shared
+   * secret key: H(q || client_pk || server_pk)
+   *
+   * Or unless this is not an option, use the crypto_kx() API instead.
+   */
+  SODIUM_EXPORT
+  int
+  crypto_scalarmult_curve25519(unsigned char *q, const unsigned char *n,
+                               const unsigned char *p)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_scalarmult_curve25519_base(unsigned char *q, const unsigned char *n);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_scalarmult_ed25519.h
+++ b/crypto/include/sodium/crypto_scalarmult_ed25519.h
@ -7,32 +7,37 @@
 #include "export.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #define crypto_scalarmult_ed25519_BYTES 32U
-SODIUM_EXPORT
-size_t crypto_scalarmult_ed25519_bytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_scalarmult_ed25519_bytes(void);

 #define crypto_scalarmult_ed25519_SCALARBYTES 32U
-SODIUM_EXPORT
-size_t crypto_scalarmult_ed25519_scalarbytes(void);
-
-/*
- * NOTE: Do not use the result of this function directly.
- *
- * Hash the result with the public keys in order to compute a shared
- * secret key: H(q || client_pk || server_pk)
- *
- * Or unless this is not an option, use the crypto_kx() API instead.
- */
-SODIUM_EXPORT
-int crypto_scalarmult_ed25519(unsigned char *q, const unsigned char *n,
-                              const unsigned char *p)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_scalarmult_ed25519_base(unsigned char *q, const unsigned char *n);
+  SODIUM_EXPORT
+  size_t
+  crypto_scalarmult_ed25519_scalarbytes(void);
+
+  /*
+   * NOTE: Do not use the result of this function directly.
+   *
+   * Hash the result with the public keys in order to compute a shared
+   * secret key: H(q || client_pk || server_pk)
+   *
+   * Or unless this is not an option, use the crypto_kx() API instead.
+   */
+  SODIUM_EXPORT
+  int
+  crypto_scalarmult_ed25519(unsigned char *q, const unsigned char *n,
+                            const unsigned char *p)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_scalarmult_ed25519_base(unsigned char *q, const unsigned char *n);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_sign.h
+++ b/crypto/include/sodium/crypto_sign.h
@ -14,87 +14,102 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

-typedef crypto_sign_ed25519ph_state crypto_sign_state;
+  typedef crypto_sign_ed25519ph_state crypto_sign_state;

-SODIUM_EXPORT
-size_t  crypto_sign_statebytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_statebytes(void);

 #define crypto_sign_BYTES crypto_sign_ed25519_BYTES
-SODIUM_EXPORT
-size_t  crypto_sign_bytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_bytes(void);

 #define crypto_sign_SEEDBYTES crypto_sign_ed25519_SEEDBYTES
-SODIUM_EXPORT
-size_t  crypto_sign_seedbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_seedbytes(void);

 #define crypto_sign_PUBLICKEYBYTES crypto_sign_ed25519_PUBLICKEYBYTES
-SODIUM_EXPORT
-size_t  crypto_sign_publickeybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_publickeybytes(void);

 #define crypto_sign_SECRETKEYBYTES crypto_sign_ed25519_SECRETKEYBYTES
-SODIUM_EXPORT
-size_t  crypto_sign_secretkeybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_secretkeybytes(void);

 #define crypto_sign_MESSAGEBYTES_MAX crypto_sign_ed25519_MESSAGEBYTES_MAX
-SODIUM_EXPORT
-size_t  crypto_sign_messagebytes_max(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_messagebytes_max(void);

 #define crypto_sign_PRIMITIVE "ed25519"
-SODIUM_EXPORT
-const char *crypto_sign_primitive(void);
-
-SODIUM_EXPORT
-int crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk,
-                             const unsigned char *seed);
-
-SODIUM_EXPORT
-int crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign(unsigned char *sm, unsigned long long *smlen_p,
-                const unsigned char *m, unsigned long long mlen,
-                const unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_open(unsigned char *m, unsigned long long *mlen_p,
-                     const unsigned char *sm, unsigned long long smlen,
-                     const unsigned char *pk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_sign_detached(unsigned char *sig, unsigned long long *siglen_p,
-                         const unsigned char *m, unsigned long long mlen,
-                         const unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_verify_detached(const unsigned char *sig,
-                                const unsigned char *m,
-                                unsigned long long mlen,
-                                const unsigned char *pk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_sign_init(crypto_sign_state *state);
-
-SODIUM_EXPORT
-int crypto_sign_update(crypto_sign_state *state,
-                       const unsigned char *m, unsigned long long mlen);
-
-SODIUM_EXPORT
-int crypto_sign_final_create(crypto_sign_state *state, unsigned char *sig,
-                             unsigned long long *siglen_p,
-                             const unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_final_verify(crypto_sign_state *state, unsigned char *sig,
-                             const unsigned char *pk)
-            __attribute__ ((warn_unused_result));
+  SODIUM_EXPORT
+  const char *
+  crypto_sign_primitive(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk,
+                           const unsigned char *seed);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign(unsigned char *sm, unsigned long long *smlen_p,
+              const unsigned char *m, unsigned long long mlen,
+              const unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_open(unsigned char *m, unsigned long long *mlen_p,
+                   const unsigned char *sm, unsigned long long smlen,
+                   const unsigned char *pk) __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_detached(unsigned char *sig, unsigned long long *siglen_p,
+                       const unsigned char *m, unsigned long long mlen,
+                       const unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_verify_detached(const unsigned char *sig, const unsigned char *m,
+                              unsigned long long mlen, const unsigned char *pk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_init(crypto_sign_state *state);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_update(crypto_sign_state *state, const unsigned char *m,
+                     unsigned long long mlen);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_final_create(crypto_sign_state *state, unsigned char *sig,
+                           unsigned long long *siglen_p,
+                           const unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_final_verify(crypto_sign_state *state, unsigned char *sig,
+                           const unsigned char *pk)
+      __attribute__((warn_unused_result));

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_sign_ed25519.h
+++ b/crypto/include/sodium/crypto_sign_ed25519.h
@ -6,106 +6,125 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

-typedef struct crypto_sign_ed25519ph_state {
+  typedef struct crypto_sign_ed25519ph_state
+  {
    crypto_hash_sha512_state hs;
-} crypto_sign_ed25519ph_state;
+  } crypto_sign_ed25519ph_state;

-SODIUM_EXPORT
-size_t crypto_sign_ed25519ph_statebytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_ed25519ph_statebytes(void);

 #define crypto_sign_ed25519_BYTES 64U
-SODIUM_EXPORT
-size_t crypto_sign_ed25519_bytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_ed25519_bytes(void);

 #define crypto_sign_ed25519_SEEDBYTES 32U
-SODIUM_EXPORT
-size_t crypto_sign_ed25519_seedbytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_ed25519_seedbytes(void);

 #define crypto_sign_ed25519_PUBLICKEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_sign_ed25519_publickeybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_ed25519_publickeybytes(void);

 #define crypto_sign_ed25519_SECRETKEYBYTES (32U + 32U)
-SODIUM_EXPORT
-size_t crypto_sign_ed25519_secretkeybytes(void);
-
-#define crypto_sign_ed25519_MESSAGEBYTES_MAX (SODIUM_SIZE_MAX - crypto_sign_ed25519_BYTES)
-SODIUM_EXPORT
-size_t crypto_sign_ed25519_messagebytes_max(void);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519(unsigned char *sm, unsigned long long *smlen_p,
-                        const unsigned char *m, unsigned long long mlen,
-                        const unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_open(unsigned char *m, unsigned long long *mlen_p,
-                             const unsigned char *sm, unsigned long long smlen,
-                             const unsigned char *pk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_detached(unsigned char *sig,
-                                 unsigned long long *siglen_p,
-                                 const unsigned char *m,
-                                 unsigned long long mlen,
-                                 const unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_verify_detached(const unsigned char *sig,
-                                        const unsigned char *m,
-                                        unsigned long long mlen,
-                                        const unsigned char *pk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_keypair(unsigned char *pk, unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_seed_keypair(unsigned char *pk, unsigned char *sk,
-                                     const unsigned char *seed);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_pk_to_curve25519(unsigned char *curve25519_pk,
-                                         const unsigned char *ed25519_pk)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_sk_to_curve25519(unsigned char *curve25519_sk,
-                                         const unsigned char *ed25519_sk);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_sk_to_seed(unsigned char *seed,
-                                   const unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519_sk_to_pk(unsigned char *pk, const unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519ph_init(crypto_sign_ed25519ph_state *state);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519ph_update(crypto_sign_ed25519ph_state *state,
-                                 const unsigned char *m,
-                                 unsigned long long mlen);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519ph_final_create(crypto_sign_ed25519ph_state *state,
-                                       unsigned char *sig,
-                                       unsigned long long *siglen_p,
-                                       const unsigned char *sk);
-
-SODIUM_EXPORT
-int crypto_sign_ed25519ph_final_verify(crypto_sign_ed25519ph_state *state,
-                                       unsigned char *sig,
-                                       const unsigned char *pk)
-            __attribute__ ((warn_unused_result));
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_ed25519_secretkeybytes(void);
+
+#define crypto_sign_ed25519_MESSAGEBYTES_MAX \
+  (SODIUM_SIZE_MAX - crypto_sign_ed25519_BYTES)
+  SODIUM_EXPORT
+  size_t
+  crypto_sign_ed25519_messagebytes_max(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519(unsigned char *sm, unsigned long long *smlen_p,
+                      const unsigned char *m, unsigned long long mlen,
+                      const unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_open(unsigned char *m, unsigned long long *mlen_p,
+                           const unsigned char *sm, unsigned long long smlen,
+                           const unsigned char *pk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_detached(unsigned char *sig, unsigned long long *siglen_p,
+                               const unsigned char *m, unsigned long long mlen,
+                               const unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_verify_detached(const unsigned char *sig,
+                                      const unsigned char *m,
+                                      unsigned long long mlen,
+                                      const unsigned char *pk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_keypair(unsigned char *pk, unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_seed_keypair(unsigned char *pk, unsigned char *sk,
+                                   const unsigned char *seed);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_pk_to_curve25519(unsigned char *curve25519_pk,
+                                       const unsigned char *ed25519_pk)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_sk_to_curve25519(unsigned char *curve25519_sk,
+                                       const unsigned char *ed25519_sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_sk_to_seed(unsigned char *seed, const unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519_sk_to_pk(unsigned char *pk, const unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519ph_init(crypto_sign_ed25519ph_state *state);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519ph_update(crypto_sign_ed25519ph_state *state,
+                               const unsigned char *m, unsigned long long mlen);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519ph_final_create(crypto_sign_ed25519ph_state *state,
+                                     unsigned char *sig,
+                                     unsigned long long *siglen_p,
+                                     const unsigned char *sk);
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_ed25519ph_final_verify(crypto_sign_ed25519ph_state *state,
+                                     unsigned char *sig,
+                                     const unsigned char *pk)
+      __attribute__((warn_unused_result));

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_sign_edwards25519sha512batch.h
+++ b/crypto/include/sodium/crypto_sign_edwards25519sha512batch.h
@ -16,37 +16,42 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

 #define crypto_sign_edwards25519sha512batch_BYTES 64U
 #define crypto_sign_edwards25519sha512batch_PUBLICKEYBYTES 32U
 #define crypto_sign_edwards25519sha512batch_SECRETKEYBYTES (32U + 32U)
-#define crypto_sign_edwards25519sha512batch_MESSAGEBYTES_MAX (SODIUM_SIZE_MAX - crypto_sign_edwards25519sha512batch_BYTES)
-
-SODIUM_EXPORT
-int crypto_sign_edwards25519sha512batch(unsigned char *sm,
-                                        unsigned long long *smlen_p,
-                                        const unsigned char *m,
-                                        unsigned long long mlen,
-                                        const unsigned char *sk)
-       __attribute__ ((deprecated));
-
-SODIUM_EXPORT
-int crypto_sign_edwards25519sha512batch_open(unsigned char *m,
-                                             unsigned long long *mlen_p,
-                                             const unsigned char *sm,
-                                             unsigned long long smlen,
-                                             const unsigned char *pk)
-       __attribute__ ((deprecated));
-
-SODIUM_EXPORT
-int crypto_sign_edwards25519sha512batch_keypair(unsigned char *pk,
-                                                unsigned char *sk)
-       __attribute__ ((deprecated));
+#define crypto_sign_edwards25519sha512batch_MESSAGEBYTES_MAX \
+  (SODIUM_SIZE_MAX - crypto_sign_edwards25519sha512batch_BYTES)
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_edwards25519sha512batch(unsigned char *sm,
+                                      unsigned long long *smlen_p,
+                                      const unsigned char *m,
+                                      unsigned long long mlen,
+                                      const unsigned char *sk)
+      __attribute__((deprecated));
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_edwards25519sha512batch_open(unsigned char *m,
+                                           unsigned long long *mlen_p,
+                                           const unsigned char *sm,
+                                           unsigned long long smlen,
+                                           const unsigned char *pk)
+      __attribute__((deprecated));
+
+  SODIUM_EXPORT
+  int
+  crypto_sign_edwards25519sha512batch_keypair(unsigned char *pk,
+                                              unsigned char *sk)
+      __attribute__((deprecated));

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_stream_chacha20.h
+++ b/crypto/include/sodium/crypto_stream_chacha20.h
@ -14,82 +14,103 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

 #define crypto_stream_chacha20_KEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_stream_chacha20_keybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_chacha20_keybytes(void);

 #define crypto_stream_chacha20_NONCEBYTES 8U
-SODIUM_EXPORT
-size_t crypto_stream_chacha20_noncebytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_chacha20_noncebytes(void);

 #define crypto_stream_chacha20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
-SODIUM_EXPORT
-size_t crypto_stream_chacha20_messagebytes_max(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_chacha20_messagebytes_max(void);

-/* ChaCha20 with a 64-bit nonce and a 64-bit counter, as originally designed */
+  /* ChaCha20 with a 64-bit nonce and a 64-bit counter, as originally designed
+   */

-SODIUM_EXPORT
-int crypto_stream_chacha20(unsigned char *c, unsigned long long clen,
-                           const unsigned char *n, const unsigned char *k);
+  SODIUM_EXPORT
+  int
+  crypto_stream_chacha20(unsigned char *c, unsigned long long clen,
+                         const unsigned char *n, const unsigned char *k);

-SODIUM_EXPORT
-int crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m,
-                               unsigned long long mlen, const unsigned char *n,
-                               const unsigned char *k);
+  SODIUM_EXPORT
+  int
+  crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m,
+                             unsigned long long mlen, const unsigned char *n,
+                             const unsigned char *k);

-SODIUM_EXPORT
-int crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m,
-                                  unsigned long long mlen,
-                                  const unsigned char *n, uint64_t ic,
-                                  const unsigned char *k);
+  SODIUM_EXPORT
+  int
+  crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m,
+                                unsigned long long mlen, const unsigned char *n,
+                                uint64_t ic, const unsigned char *k);

-SODIUM_EXPORT
-void crypto_stream_chacha20_keygen(unsigned char k[crypto_stream_chacha20_KEYBYTES]);
+  SODIUM_EXPORT
+  void
+  crypto_stream_chacha20_keygen(
+      unsigned char k[crypto_stream_chacha20_KEYBYTES]);

-/* ChaCha20 with a 96-bit nonce and a 32-bit counter (IETF) */
+  /* ChaCha20 with a 96-bit nonce and a 32-bit counter (IETF) */

 #define crypto_stream_chacha20_ietf_KEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_stream_chacha20_ietf_keybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_chacha20_ietf_keybytes(void);

 #define crypto_stream_chacha20_ietf_NONCEBYTES 12U
-SODIUM_EXPORT
-size_t crypto_stream_chacha20_ietf_noncebytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_chacha20_ietf_noncebytes(void);

 #define crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX \
-    SODIUM_MIN(SODIUM_SIZE_MAX, 64ULL * (1ULL << 32))
-SODIUM_EXPORT
-size_t crypto_stream_chacha20_ietf_messagebytes_max(void);
-
-SODIUM_EXPORT
-int crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen,
-                                const unsigned char *n, const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m,
-                                    unsigned long long mlen, const unsigned char *n,
-                                    const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m,
-                                       unsigned long long mlen,
-                                       const unsigned char *n, uint32_t ic,
-                                       const unsigned char *k);
-
-SODIUM_EXPORT
-void crypto_stream_chacha20_ietf_keygen(unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]);
-
-/* Aliases */
+  SODIUM_MIN(SODIUM_SIZE_MAX, 64ULL * (1ULL << 32))
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_chacha20_ietf_messagebytes_max(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen,
+                              const unsigned char *n, const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m,
+                                  unsigned long long mlen,
+                                  const unsigned char *n,
+                                  const unsigned char *k);

-#define crypto_stream_chacha20_IETF_KEYBYTES crypto_stream_chacha20_ietf_KEYBYTES
-#define crypto_stream_chacha20_IETF_NONCEBYTES crypto_stream_chacha20_ietf_NONCEBYTES
-#define crypto_stream_chacha20_IETF_MESSAGEBYTES_MAX crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX
+  SODIUM_EXPORT
+  int
+  crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m,
+                                     unsigned long long mlen,
+                                     const unsigned char *n, uint32_t ic,
+                                     const unsigned char *k);
+
+  SODIUM_EXPORT
+  void
+  crypto_stream_chacha20_ietf_keygen(
+      unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]);
+
+  /* Aliases */
+
+#define crypto_stream_chacha20_IETF_KEYBYTES \
+  crypto_stream_chacha20_ietf_KEYBYTES
+#define crypto_stream_chacha20_IETF_NONCEBYTES \
+  crypto_stream_chacha20_ietf_NONCEBYTES
+#define crypto_stream_chacha20_IETF_MESSAGEBYTES_MAX \
+  crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_stream_salsa20.h
+++ b/crypto/include/sodium/crypto_stream_salsa20.h
@ -14,41 +14,48 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

 #define crypto_stream_salsa20_KEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_stream_salsa20_keybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_salsa20_keybytes(void);

 #define crypto_stream_salsa20_NONCEBYTES 8U
-SODIUM_EXPORT
-size_t crypto_stream_salsa20_noncebytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_salsa20_noncebytes(void);

 #define crypto_stream_salsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
-SODIUM_EXPORT
-size_t crypto_stream_salsa20_messagebytes_max(void);
-
-SODIUM_EXPORT
-int crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
-                          const unsigned char *n, const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
-                              unsigned long long mlen, const unsigned char *n,
-                              const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
-                                 unsigned long long mlen,
-                                 const unsigned char *n, uint64_t ic,
-                                 const unsigned char *k);
-
-SODIUM_EXPORT
-void crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_salsa20_messagebytes_max(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
+                        const unsigned char *n, const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
+                            unsigned long long mlen, const unsigned char *n,
+                            const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
+                               unsigned long long mlen, const unsigned char *n,
+                               uint64_t ic, const unsigned char *k);
+
+  SODIUM_EXPORT
+  void
+  crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/crypto_stream_xsalsa20.h
+++ b/crypto/include/sodium/crypto_stream_xsalsa20.h
@ -14,41 +14,49 @@
 #include "export.h"

 #ifdef __cplusplus
-# ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wlong-long"
-# endif
-extern "C" {
+#ifdef __GNUC__
+#pragma GCC diagnostic ignored "-Wlong-long"
+#endif
+extern "C"
+{
 #endif

 #define crypto_stream_xsalsa20_KEYBYTES 32U
-SODIUM_EXPORT
-size_t crypto_stream_xsalsa20_keybytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_xsalsa20_keybytes(void);

 #define crypto_stream_xsalsa20_NONCEBYTES 24U
-SODIUM_EXPORT
-size_t crypto_stream_xsalsa20_noncebytes(void);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_xsalsa20_noncebytes(void);

 #define crypto_stream_xsalsa20_MESSAGEBYTES_MAX SODIUM_SIZE_MAX
-SODIUM_EXPORT
-size_t crypto_stream_xsalsa20_messagebytes_max(void);
-
-SODIUM_EXPORT
-int crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen,
-                           const unsigned char *n, const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m,
-                               unsigned long long mlen, const unsigned char *n,
-                               const unsigned char *k);
-
-SODIUM_EXPORT
-int crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m,
-                                  unsigned long long mlen,
-                                  const unsigned char *n, uint64_t ic,
-                                  const unsigned char *k);
-
-SODIUM_EXPORT
-void crypto_stream_xsalsa20_keygen(unsigned char k[crypto_stream_xsalsa20_KEYBYTES]);
+  SODIUM_EXPORT
+  size_t
+  crypto_stream_xsalsa20_messagebytes_max(void);
+
+  SODIUM_EXPORT
+  int
+  crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen,
+                         const unsigned char *n, const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m,
+                             unsigned long long mlen, const unsigned char *n,
+                             const unsigned char *k);
+
+  SODIUM_EXPORT
+  int
+  crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m,
+                                unsigned long long mlen, const unsigned char *n,
+                                uint64_t ic, const unsigned char *k);
+
+  SODIUM_EXPORT
+  void
+  crypto_stream_xsalsa20_keygen(
+      unsigned char k[crypto_stream_xsalsa20_KEYBYTES]);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/export.h
+++ b/crypto/include/sodium/export.h
@ -7,48 +7,48 @@
 #include <limits.h>

 #if !defined(__clang__) && !defined(__GNUC__)
-# ifdef __attribute__
-#  undef __attribute__
-# endif
-# define __attribute__(a)
+#ifdef __attribute__
+#undef __attribute__
+#endif
+#define __attribute__(a)
 #endif

 #ifdef SODIUM_STATIC
-# define SODIUM_EXPORT
-# define SODIUM_EXPORT_WEAK
+#define SODIUM_EXPORT
+#define SODIUM_EXPORT_WEAK
+#else
+#if defined(_MSC_VER)
+#ifdef SODIUM_DLL_EXPORT
+#define SODIUM_EXPORT __declspec(dllexport)
+#else
+#define SODIUM_EXPORT __declspec(dllimport)
+#endif
+#else
+#if defined(__SUNPRO_C)
+#ifndef __GNU_C__
+#define SODIUM_EXPORT __attribute__(visibility(__global))
+#else
+#define SODIUM_EXPORT __attribute__ __global
+#endif
+#elif defined(_MSG_VER)
+#define SODIUM_EXPORT extern __declspec(dllexport)
 #else
-# if defined(_MSC_VER)
-#  ifdef SODIUM_DLL_EXPORT
-#   define SODIUM_EXPORT __declspec(dllexport)
-#  else
-#   define SODIUM_EXPORT __declspec(dllimport)
-#  endif
-# else
-#  if defined(__SUNPRO_C)
-#   ifndef __GNU_C__
-#    define SODIUM_EXPORT __attribute__ (visibility(__global))
-#   else
-#    define SODIUM_EXPORT __attribute__ __global
-#   endif
-#  elif defined(_MSG_VER)
-#   define SODIUM_EXPORT extern __declspec(dllexport)
-#  else
-#   define SODIUM_EXPORT __attribute__ ((visibility ("default")))
-#  endif
-# endif
-# if defined(__ELF__) && !defined(SODIUM_DISABLE_WEAK_FUNCTIONS)
-#  define SODIUM_EXPORT_WEAK SODIUM_EXPORT __attribute__((weak))
-# else
-#  define SODIUM_EXPORT_WEAK SODIUM_EXPORT
-# endif
+#define SODIUM_EXPORT __attribute__((visibility("default")))
+#endif
+#endif
+#if defined(__ELF__) && !defined(SODIUM_DISABLE_WEAK_FUNCTIONS)
+#define SODIUM_EXPORT_WEAK SODIUM_EXPORT __attribute__((weak))
+#else
+#define SODIUM_EXPORT_WEAK SODIUM_EXPORT
+#endif
 #endif

 #ifndef CRYPTO_ALIGN
-# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
-#  define CRYPTO_ALIGN(x) __declspec(align(x))
-# else
-#  define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
-# endif
+#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
+#define CRYPTO_ALIGN(x) __declspec(align(x))
+#else
+#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
+#endif
 #endif

 #define SODIUM_MIN(A, B) ((A) < (B) ? (A) : (B))
--- a/crypto/include/sodium/private/common.h
+++ b/crypto/include/sodium/private/common.h
@ -5,42 +5,42 @@
 #include <stdlib.h>
 #include <string.h>

-#define COMPILER_ASSERT(X) (void) sizeof(char[(X) ? 1 : -1])
+#define COMPILER_ASSERT(X) (void)sizeof(char[(X) ? 1 : -1])

 #ifdef HAVE_TI_MODE
-# if defined(__SIZEOF_INT128__)
+#if defined(__SIZEOF_INT128__)
 typedef unsigned __int128 uint128_t;
-# else
+#else
 typedef unsigned uint128_t __attribute__((mode(TI)));
-# endif
+#endif
 #endif

 #define ROTL32(X, B) rotl32((X), (B))
 static inline uint32_t
 rotl32(const uint32_t x, const int b)
 {
-    return (x << b) | (x >> (32 - b));
+  return (x << b) | (x >> (32 - b));
 }

 #define ROTL64(X, B) rotl64((X), (B))
 static inline uint64_t
 rotl64(const uint64_t x, const int b)
 {
-    return (x << b) | (x >> (64 - b));
+  return (x << b) | (x >> (64 - b));
 }

 #define ROTR32(X, B) rotr32((X), (B))
 static inline uint32_t
 rotr32(const uint32_t x, const int b)
 {
-    return (x >> b) | (x << (32 - b));
+  return (x >> b) | (x << (32 - b));
 }

 #define ROTR64(X, B) rotr64((X), (B))
 static inline uint64_t
 rotr64(const uint64_t x, const int b)
 {
-    return (x >> b) | (x << (64 - b));
+  return (x >> b) | (x << (64 - b));
 }

 #define LOAD64_LE(SRC) load64_le(SRC)
@ -48,19 +48,19 @@ static inline uint64_t
 load64_le(const uint8_t src[8])
 {
 #ifdef NATIVE_LITTLE_ENDIAN
-    uint64_t w;
-    memcpy(&w, src, sizeof w);
-    return w;
+  uint64_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
 #else
-    uint64_t w = (uint64_t) src[0];
-    w |= (uint64_t) src[1] <<  8;
-    w |= (uint64_t) src[2] << 16;
-    w |= (uint64_t) src[3] << 24;
-    w |= (uint64_t) src[4] << 32;
-    w |= (uint64_t) src[5] << 40;
-    w |= (uint64_t) src[6] << 48;
-    w |= (uint64_t) src[7] << 56;
-    return w;
+  uint64_t w = (uint64_t)src[0];
+  w |= (uint64_t)src[1] << 8;
+  w |= (uint64_t)src[2] << 16;
+  w |= (uint64_t)src[3] << 24;
+  w |= (uint64_t)src[4] << 32;
+  w |= (uint64_t)src[5] << 40;
+  w |= (uint64_t)src[6] << 48;
+  w |= (uint64_t)src[7] << 56;
+  return w;
 #endif
 }

@ -69,16 +69,23 @@ static inline void
 store64_le(uint8_t dst[8], uint64_t w)
 {
 #ifdef NATIVE_LITTLE_ENDIAN
-    memcpy(dst, &w, sizeof w);
+  memcpy(dst, &w, sizeof w);
 #else
-    dst[0] = (uint8_t) w; w >>= 8;
-    dst[1] = (uint8_t) w; w >>= 8;
-    dst[2] = (uint8_t) w; w >>= 8;
-    dst[3] = (uint8_t) w; w >>= 8;
-    dst[4] = (uint8_t) w; w >>= 8;
-    dst[5] = (uint8_t) w; w >>= 8;
-    dst[6] = (uint8_t) w; w >>= 8;
-    dst[7] = (uint8_t) w;
+  dst[0] = (uint8_t)w;
+  w >>= 8;
+  dst[1] = (uint8_t)w;
+  w >>= 8;
+  dst[2] = (uint8_t)w;
+  w >>= 8;
+  dst[3] = (uint8_t)w;
+  w >>= 8;
+  dst[4] = (uint8_t)w;
+  w >>= 8;
+  dst[5] = (uint8_t)w;
+  w >>= 8;
+  dst[6] = (uint8_t)w;
+  w >>= 8;
+  dst[7]     = (uint8_t)w;
 #endif
 }

@ -87,15 +94,15 @@ static inline uint32_t
 load32_le(const uint8_t src[4])
 {
 #ifdef NATIVE_LITTLE_ENDIAN
-    uint32_t w;
-    memcpy(&w, src, sizeof w);
-    return w;
+  uint32_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
 #else
-    uint32_t w = (uint32_t) src[0];
-    w |= (uint32_t) src[1] <<  8;
-    w |= (uint32_t) src[2] << 16;
-    w |= (uint32_t) src[3] << 24;
-    return w;
+  uint32_t w = (uint32_t)src[0];
+  w |= (uint32_t)src[1] << 8;
+  w |= (uint32_t)src[2] << 16;
+  w |= (uint32_t)src[3] << 24;
+  return w;
 #endif
 }

@ -104,12 +111,15 @@ static inline void
 store32_le(uint8_t dst[4], uint32_t w)
 {
 #ifdef NATIVE_LITTLE_ENDIAN
-    memcpy(dst, &w, sizeof w);
+  memcpy(dst, &w, sizeof w);
 #else
-    dst[0] = (uint8_t) w; w >>= 8;
-    dst[1] = (uint8_t) w; w >>= 8;
-    dst[2] = (uint8_t) w; w >>= 8;
-    dst[3] = (uint8_t) w;
+  dst[0] = (uint8_t)w;
+  w >>= 8;
+  dst[1] = (uint8_t)w;
+  w >>= 8;
+  dst[2] = (uint8_t)w;
+  w >>= 8;
+  dst[3]     = (uint8_t)w;
 #endif
 }

@ -120,19 +130,19 @@ static inline uint64_t
 load64_be(const uint8_t src[8])
 {
 #ifdef NATIVE_BIG_ENDIAN
-    uint64_t w;
-    memcpy(&w, src, sizeof w);
-    return w;
+  uint64_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
 #else
-    uint64_t w = (uint64_t) src[7];
-    w |= (uint64_t) src[6] <<  8;
-    w |= (uint64_t) src[5] << 16;
-    w |= (uint64_t) src[4] << 24;
-    w |= (uint64_t) src[3] << 32;
-    w |= (uint64_t) src[2] << 40;
-    w |= (uint64_t) src[1] << 48;
-    w |= (uint64_t) src[0] << 56;
-    return w;
+  uint64_t w = (uint64_t)src[7];
+  w |= (uint64_t)src[6] << 8;
+  w |= (uint64_t)src[5] << 16;
+  w |= (uint64_t)src[4] << 24;
+  w |= (uint64_t)src[3] << 32;
+  w |= (uint64_t)src[2] << 40;
+  w |= (uint64_t)src[1] << 48;
+  w |= (uint64_t)src[0] << 56;
+  return w;
 #endif
 }

@ -141,16 +151,23 @@ static inline void
 store64_be(uint8_t dst[8], uint64_t w)
 {
 #ifdef NATIVE_BIG_ENDIAN
-    memcpy(dst, &w, sizeof w);
+  memcpy(dst, &w, sizeof w);
 #else
-    dst[7] = (uint8_t) w; w >>= 8;
-    dst[6] = (uint8_t) w; w >>= 8;
-    dst[5] = (uint8_t) w; w >>= 8;
-    dst[4] = (uint8_t) w; w >>= 8;
-    dst[3] = (uint8_t) w; w >>= 8;
-    dst[2] = (uint8_t) w; w >>= 8;
-    dst[1] = (uint8_t) w; w >>= 8;
-    dst[0] = (uint8_t) w;
+  dst[7] = (uint8_t)w;
+  w >>= 8;
+  dst[6] = (uint8_t)w;
+  w >>= 8;
+  dst[5] = (uint8_t)w;
+  w >>= 8;
+  dst[4] = (uint8_t)w;
+  w >>= 8;
+  dst[3] = (uint8_t)w;
+  w >>= 8;
+  dst[2] = (uint8_t)w;
+  w >>= 8;
+  dst[1] = (uint8_t)w;
+  w >>= 8;
+  dst[0]     = (uint8_t)w;
 #endif
 }

@ -159,15 +176,15 @@ static inline uint32_t
 load32_be(const uint8_t src[4])
 {
 #ifdef NATIVE_BIG_ENDIAN
-    uint32_t w;
-    memcpy(&w, src, sizeof w);
-    return w;
+  uint32_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
 #else
-    uint32_t w = (uint32_t) src[3];
-    w |= (uint32_t) src[2] <<  8;
-    w |= (uint32_t) src[1] << 16;
-    w |= (uint32_t) src[0] << 24;
-    return w;
+  uint32_t w = (uint32_t)src[3];
+  w |= (uint32_t)src[2] << 8;
+  w |= (uint32_t)src[1] << 16;
+  w |= (uint32_t)src[0] << 24;
+  return w;
 #endif
 }

@ -176,12 +193,15 @@ static inline void
 store32_be(uint8_t dst[4], uint32_t w)
 {
 #ifdef NATIVE_BIG_ENDIAN
-    memcpy(dst, &w, sizeof w);
+  memcpy(dst, &w, sizeof w);
 #else
-    dst[3] = (uint8_t) w; w >>= 8;
-    dst[2] = (uint8_t) w; w >>= 8;
-    dst[1] = (uint8_t) w; w >>= 8;
-    dst[0] = (uint8_t) w;
+  dst[3] = (uint8_t)w;
+  w >>= 8;
+  dst[2] = (uint8_t)w;
+  w >>= 8;
+  dst[1] = (uint8_t)w;
+  w >>= 8;
+  dst[0] = (uint8_t)w;
 #endif
 }

@ -189,58 +209,61 @@ store32_be(uint8_t dst[4], uint32_t w)
 static inline void
 xor_buf(unsigned char *out, const unsigned char *in, size_t n)
 {
-    size_t i;
+  size_t i;

-    for (i = 0; i < n; i++) {
-        out[i] ^= in[i];
-    }
+  for(i = 0; i < n; i++)
+  {
+    out[i] ^= in[i];
+  }
 }

 #if !defined(__clang__) && !defined(__GNUC__)
-# ifdef __attribute__
-#  undef __attribute__
-# endif
-# define __attribute__(a)
+#ifdef __attribute__
+#undef __attribute__
+#endif
+#define __attribute__(a)
 #endif

 #ifndef CRYPTO_ALIGN
-# if defined(__INTEL_COMPILER) || defined(_MSC_VER)
-#  define CRYPTO_ALIGN(x) __declspec(align(x))
-# else
-#  define CRYPTO_ALIGN(x) __attribute__ ((aligned(x)))
-# endif
-#endif
-
-#if defined(_MSC_VER) && \
-    (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
-
-# include <intrin.h>
-
-# define HAVE_INTRIN_H    1
-# define HAVE_MMINTRIN_H  1
-# define HAVE_EMMINTRIN_H 1
-# define HAVE_PMMINTRIN_H 1
-# define HAVE_TMMINTRIN_H 1
-# define HAVE_SMMINTRIN_H 1
-# define HAVE_AVXINTRIN_H 1
-# if _MSC_VER >= 1600
-#  define HAVE_WMMINTRIN_H 1
-# endif
-# if _MSC_VER >= 1700 && defined(_M_X64)
-#  define HAVE_AVX2INTRIN_H 1
-# endif
+#if defined(__INTEL_COMPILER) || defined(_MSC_VER)
+#define CRYPTO_ALIGN(x) __declspec(align(x))
+#else
+#define CRYPTO_ALIGN(x) __attribute__((aligned(x)))
+#endif
+#endif
+
+#if defined(_MSC_VER) \
+    && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))
+
+#include <intrin.h>
+
+#define HAVE_INTRIN_H 1
+#define HAVE_MMINTRIN_H 1
+#define HAVE_EMMINTRIN_H 1
+#define HAVE_PMMINTRIN_H 1
+#define HAVE_TMMINTRIN_H 1
+#define HAVE_SMMINTRIN_H 1
+#define HAVE_AVXINTRIN_H 1
+#if _MSC_VER >= 1600
+#define HAVE_WMMINTRIN_H 1
+#endif
+#if _MSC_VER >= 1700 && defined(_M_X64)
+#define HAVE_AVX2INTRIN_H 1
+#endif
 #elif defined(HAVE_INTRIN_H)
-# include <intrin.h>
+#include <intrin.h>
 #endif

 #ifdef HAVE_LIBCTGRIND
-extern void ct_poison  (const void *, size_t);
-extern void ct_unpoison(const void *, size_t);
-# define POISON(X, L)   ct_poison((X), (L))
-# define UNPOISON(X, L) ct_unpoison((X), (L))
+extern void
+ct_poison(const void *, size_t);
+extern void
+ct_unpoison(const void *, size_t);
+#define POISON(X, L) ct_poison((X), (L))
+#define UNPOISON(X, L) ct_unpoison((X), (L))
 #else
-# define POISON(X, L)   (void) 0
-# define UNPOISON(X, L) (void) 0
+#define POISON(X, L) (void)0
+#define UNPOISON(X, L) (void)0
 #endif

 #endif
--- a/crypto/include/sodium/private/ed25519_ref10.h
+++ b/crypto/include/sodium/private/ed25519_ref10.h
@ -15,17 +15,19 @@ typedef uint64_t fe25519[5];
 typedef int32_t fe25519[10];
 #endif

-void fe25519_invert(fe25519 out, const fe25519 z);
-void fe25519_frombytes(fe25519 h, const unsigned char *s);
-void fe25519_tobytes(unsigned char *s, const fe25519 h);
+void
+fe25519_invert(fe25519 out, const fe25519 z);
+void
+fe25519_frombytes(fe25519 h, const unsigned char *s);
+void
+fe25519_tobytes(unsigned char *s, const fe25519 h);

 #ifdef HAVE_TI_MODE
-# include "ed25519_ref10_fe_51.h"
+#include "ed25519_ref10_fe_51.h"
 #else
-# include "ed25519_ref10_fe_25_5.h"
+#include "ed25519_ref10_fe_25_5.h"
 #endif

-
 /*
 ge means group element.

@ -40,86 +42,109 @@ void fe25519_tobytes(unsigned char *s, const fe25519 h);
 ge25519_precomp (Duif): (y+x,y-x,2dxy)
 */

-typedef struct {
-    fe25519 X;
-    fe25519 Y;
-    fe25519 Z;
+typedef struct
+{
+  fe25519 X;
+  fe25519 Y;
+  fe25519 Z;
 } ge25519_p2;

-typedef struct {
-    fe25519 X;
-    fe25519 Y;
-    fe25519 Z;
-    fe25519 T;
+typedef struct
+{
+  fe25519 X;
+  fe25519 Y;
+  fe25519 Z;
+  fe25519 T;
 } ge25519_p3;

-typedef struct {
-    fe25519 X;
-    fe25519 Y;
-    fe25519 Z;
-    fe25519 T;
+typedef struct
+{
+  fe25519 X;
+  fe25519 Y;
+  fe25519 Z;
+  fe25519 T;
 } ge25519_p1p1;

-typedef struct {
-    fe25519 yplusx;
-    fe25519 yminusx;
-    fe25519 xy2d;
+typedef struct
+{
+  fe25519 yplusx;
+  fe25519 yminusx;
+  fe25519 xy2d;
 } ge25519_precomp;

-typedef struct {
-    fe25519 YplusX;
-    fe25519 YminusX;
-    fe25519 Z;
-    fe25519 T2d;
+typedef struct
+{
+  fe25519 YplusX;
+  fe25519 YminusX;
+  fe25519 Z;
+  fe25519 T2d;
 } ge25519_cached;

-void ge25519_tobytes(unsigned char *s, const ge25519_p2 *h);
+void
+ge25519_tobytes(unsigned char *s, const ge25519_p2 *h);

-void ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h);
+void
+ge25519_p3_tobytes(unsigned char *s, const ge25519_p3 *h);

-int ge25519_frombytes(ge25519_p3 *h, const unsigned char *s);
+int
+ge25519_frombytes(ge25519_p3 *h, const unsigned char *s);

-int ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s);
+int
+ge25519_frombytes_negate_vartime(ge25519_p3 *h, const unsigned char *s);

-void ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p);
+void
+ge25519_p3_to_cached(ge25519_cached *r, const ge25519_p3 *p);

-void ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p);
+void
+ge25519_p1p1_to_p2(ge25519_p2 *r, const ge25519_p1p1 *p);

-void ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p);
+void
+ge25519_p1p1_to_p3(ge25519_p3 *r, const ge25519_p1p1 *p);

-void ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
+void
+ge25519_add(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);

-void ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);
+void
+ge25519_sub(ge25519_p1p1 *r, const ge25519_p3 *p, const ge25519_cached *q);

-void ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a);
+void
+ge25519_scalarmult_base(ge25519_p3 *h, const unsigned char *a);

-void ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
-                                       const ge25519_p3 *A,
-                                       const unsigned char *b);
+void
+ge25519_double_scalarmult_vartime(ge25519_p2 *r, const unsigned char *a,
+                                  const ge25519_p3 *A, const unsigned char *b);

-void ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a,
-                        const ge25519_p3 *p);
+void
+ge25519_scalarmult(ge25519_p3 *h, const unsigned char *a, const ge25519_p3 *p);

-int ge25519_is_canonical(const unsigned char *s);
+int
+ge25519_is_canonical(const unsigned char *s);

-int ge25519_is_on_curve(const ge25519_p3 *p);
+int
+ge25519_is_on_curve(const ge25519_p3 *p);

-int ge25519_is_on_main_subgroup(const ge25519_p3 *p);
+int
+ge25519_is_on_main_subgroup(const ge25519_p3 *p);

-int ge25519_has_small_order(const unsigned char s[32]);
+int
+ge25519_has_small_order(const unsigned char s[32]);

-void ge25519_from_uniform(unsigned char s[32], const unsigned char r[32]);
+void
+ge25519_from_uniform(unsigned char s[32], const unsigned char r[32]);

 /*
 The set of scalars is \Z/l
 where l = 2^252 + 27742317777372353535851937790883648493.
 */

-void sc25519_reduce(unsigned char *s);
+void
+sc25519_reduce(unsigned char *s);

-void sc25519_muladd(unsigned char *s, const unsigned char *a,
-                    const unsigned char *b, const unsigned char *c);
+void
+sc25519_muladd(unsigned char *s, const unsigned char *a, const unsigned char *b,
+               const unsigned char *c);

-int sc25519_is_canonical(const unsigned char *s);
+int
+sc25519_is_canonical(const unsigned char *s);

 #endif
--- a/crypto/include/sodium/private/ed25519_ref10_fe_51.h
+++ b/crypto/include/sodium/private/ed25519_ref10_fe_51.h
@ -10,7 +10,7 @@
 static inline void
 fe25519_0(fe25519 h)
 {
-    memset(&h[0], 0, 5 * sizeof h[0]);
+  memset(&h[0], 0, 5 * sizeof h[0]);
 }

 /*
@ -20,8 +20,8 @@ fe25519_0(fe25519 h)
 static inline void
 fe25519_1(fe25519 h)
 {
-    h[0] = 1;
-    memset(&h[1], 0, 4 * sizeof h[0]);
+  h[0] = 1;
+  memset(&h[1], 0, 4 * sizeof h[0]);
 }

 /*
@ -32,17 +32,17 @@ fe25519_1(fe25519 h)
 static inline void
 fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
 {
-    uint64_t h0 = f[0] + g[0];
-    uint64_t h1 = f[1] + g[1];
-    uint64_t h2 = f[2] + g[2];
-    uint64_t h3 = f[3] + g[3];
-    uint64_t h4 = f[4] + g[4];
-
-    h[0] = h0;
-    h[1] = h1;
-    h[2] = h2;
-    h[3] = h3;
-    h[4] = h4;
+  uint64_t h0 = f[0] + g[0];
+  uint64_t h1 = f[1] + g[1];
+  uint64_t h2 = f[2] + g[2];
+  uint64_t h3 = f[3] + g[3];
+  uint64_t h4 = f[4] + g[4];
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
 }

 /*
@ -52,37 +52,37 @@ fe25519_add(fe25519 h, const fe25519 f, const fe25519 g)
 static void
 fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
 {
-    const uint64_t mask = 0x7ffffffffffffULL;
-    uint64_t h0, h1, h2, h3, h4;
-
-    h0 = g[0];
-    h1 = g[1];
-    h2 = g[2];
-    h3 = g[3];
-    h4 = g[4];
-
-    h1 += h0 >> 51;
-    h0 &= mask;
-    h2 += h1 >> 51;
-    h1 &= mask;
-    h3 += h2 >> 51;
-    h2 &= mask;
-    h4 += h3 >> 51;
-    h3 &= mask;
-    h0 += 19ULL * (h4 >> 51);
-    h4 &= mask;
-
-    h0 = (f[0] + 0xfffffffffffdaULL) - h0;
-    h1 = (f[1] + 0xffffffffffffeULL) - h1;
-    h2 = (f[2] + 0xffffffffffffeULL) - h2;
-    h3 = (f[3] + 0xffffffffffffeULL) - h3;
-    h4 = (f[4] + 0xffffffffffffeULL) - h4;
-
-    h[0] = h0;
-    h[1] = h1;
-    h[2] = h2;
-    h[3] = h3;
-    h[4] = h4;
+  const uint64_t mask = 0x7ffffffffffffULL;
+  uint64_t h0, h1, h2, h3, h4;
+
+  h0 = g[0];
+  h1 = g[1];
+  h2 = g[2];
+  h3 = g[3];
+  h4 = g[4];
+
+  h1 += h0 >> 51;
+  h0 &= mask;
+  h2 += h1 >> 51;
+  h1 &= mask;
+  h3 += h2 >> 51;
+  h2 &= mask;
+  h4 += h3 >> 51;
+  h3 &= mask;
+  h0 += 19ULL * (h4 >> 51);
+  h4 &= mask;
+
+  h0 = (f[0] + 0xfffffffffffdaULL) - h0;
+  h1 = (f[1] + 0xffffffffffffeULL) - h1;
+  h2 = (f[2] + 0xffffffffffffeULL) - h2;
+  h3 = (f[3] + 0xffffffffffffeULL) - h3;
+  h4 = (f[4] + 0xffffffffffffeULL) - h4;
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
 }

 /*
@ -92,10 +92,10 @@ fe25519_sub(fe25519 h, const fe25519 f, const fe25519 g)
 static inline void
 fe25519_neg(fe25519 h, const fe25519 f)
 {
-    fe25519 zero;
+  fe25519 zero;

-    fe25519_0(zero);
-    fe25519_sub(h, zero, f);
+  fe25519_0(zero);
+  fe25519_sub(h, zero, f);
 }

 /*
@ -108,31 +108,31 @@ fe25519_neg(fe25519 h, const fe25519 f)
 static void
 fe25519_cmov(fe25519 f, const fe25519 g, unsigned int b)
 {
-    const uint64_t mask = (uint64_t) (-(int64_t) b);
-
-    uint64_t f0 = f[0];
-    uint64_t f1 = f[1];
-    uint64_t f2 = f[2];
-    uint64_t f3 = f[3];
-    uint64_t f4 = f[4];
-
-    uint64_t x0 = f0 ^ g[0];
-    uint64_t x1 = f1 ^ g[1];
-    uint64_t x2 = f2 ^ g[2];
-    uint64_t x3 = f3 ^ g[3];
-    uint64_t x4 = f4 ^ g[4];
-
-    x0 &= mask;
-    x1 &= mask;
-    x2 &= mask;
-    x3 &= mask;
-    x4 &= mask;
-
-    f[0] = f0 ^ x0;
-    f[1] = f1 ^ x1;
-    f[2] = f2 ^ x2;
-    f[3] = f3 ^ x3;
-    f[4] = f4 ^ x4;
+  const uint64_t mask = (uint64_t)(-(int64_t)b);
+
+  uint64_t f0 = f[0];
+  uint64_t f1 = f[1];
+  uint64_t f2 = f[2];
+  uint64_t f3 = f[3];
+  uint64_t f4 = f[4];
+
+  uint64_t x0 = f0 ^ g[0];
+  uint64_t x1 = f1 ^ g[1];
+  uint64_t x2 = f2 ^ g[2];
+  uint64_t x3 = f3 ^ g[3];
+  uint64_t x4 = f4 ^ g[4];
+
+  x0 &= mask;
+  x1 &= mask;
+  x2 &= mask;
+  x3 &= mask;
+  x4 &= mask;
+
+  f[0] = f0 ^ x0;
+  f[1] = f1 ^ x1;
+  f[2] = f2 ^ x2;
+  f[3] = f3 ^ x3;
+  f[4] = f4 ^ x4;
 }

 /*
@ -145,43 +145,43 @@ Preconditions: b in {0,1}.
 static void
 fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
 {
-    const uint64_t mask = (uint64_t) (-(int64_t) b);
-
-    uint64_t f0 = f[0];
-    uint64_t f1 = f[1];
-    uint64_t f2 = f[2];
-    uint64_t f3 = f[3];
-    uint64_t f4 = f[4];
-
-    uint64_t g0 = g[0];
-    uint64_t g1 = g[1];
-    uint64_t g2 = g[2];
-    uint64_t g3 = g[3];
-    uint64_t g4 = g[4];
-
-    uint64_t x0 = f0 ^ g0;
-    uint64_t x1 = f1 ^ g1;
-    uint64_t x2 = f2 ^ g2;
-    uint64_t x3 = f3 ^ g3;
-    uint64_t x4 = f4 ^ g4;
-
-    x0 &= mask;
-    x1 &= mask;
-    x2 &= mask;
-    x3 &= mask;
-    x4 &= mask;
-
-    f[0] = f0 ^ x0;
-    f[1] = f1 ^ x1;
-    f[2] = f2 ^ x2;
-    f[3] = f3 ^ x3;
-    f[4] = f4 ^ x4;
-
-    g[0] = g0 ^ x0;
-    g[1] = g1 ^ x1;
-    g[2] = g2 ^ x2;
-    g[3] = g3 ^ x3;
-    g[4] = g4 ^ x4;
+  const uint64_t mask = (uint64_t)(-(int64_t)b);
+
+  uint64_t f0 = f[0];
+  uint64_t f1 = f[1];
+  uint64_t f2 = f[2];
+  uint64_t f3 = f[3];
+  uint64_t f4 = f[4];
+
+  uint64_t g0 = g[0];
+  uint64_t g1 = g[1];
+  uint64_t g2 = g[2];
+  uint64_t g3 = g[3];
+  uint64_t g4 = g[4];
+
+  uint64_t x0 = f0 ^ g0;
+  uint64_t x1 = f1 ^ g1;
+  uint64_t x2 = f2 ^ g2;
+  uint64_t x3 = f3 ^ g3;
+  uint64_t x4 = f4 ^ g4;
+
+  x0 &= mask;
+  x1 &= mask;
+  x2 &= mask;
+  x3 &= mask;
+  x4 &= mask;
+
+  f[0] = f0 ^ x0;
+  f[1] = f1 ^ x1;
+  f[2] = f2 ^ x2;
+  f[3] = f3 ^ x3;
+  f[4] = f4 ^ x4;
+
+  g[0] = g0 ^ x0;
+  g[1] = g1 ^ x1;
+  g[2] = g2 ^ x2;
+  g[3] = g3 ^ x3;
+  g[4] = g4 ^ x4;
 }

 /*
@ -191,17 +191,17 @@ fe25519_cswap(fe25519 f, fe25519 g, unsigned int b)
 static inline void
 fe25519_copy(fe25519 h, const fe25519 f)
 {
-    uint64_t f0 = f[0];
-    uint64_t f1 = f[1];
-    uint64_t f2 = f[2];
-    uint64_t f3 = f[3];
-    uint64_t f4 = f[4];
-
-    h[0] = f0;
-    h[1] = f1;
-    h[2] = f2;
-    h[3] = f3;
-    h[4] = f4;
+  uint64_t f0 = f[0];
+  uint64_t f1 = f[1];
+  uint64_t f2 = f[2];
+  uint64_t f3 = f[3];
+  uint64_t f4 = f[4];
+
+  h[0] = f0;
+  h[1] = f1;
+  h[2] = f2;
+  h[3] = f3;
+  h[4] = f4;
 }

 /*
@ -212,11 +212,11 @@ fe25519_copy(fe25519 h, const fe25519 f)
 static inline int
 fe25519_isnegative(const fe25519 f)
 {
-    unsigned char s[32];
+  unsigned char s[32];

-    fe25519_tobytes(s, f);
+  fe25519_tobytes(s, f);

-    return s[0] & 1;
+  return s[0] & 1;
 }

 /*
@ -227,11 +227,11 @@ fe25519_isnegative(const fe25519 f)
 static inline int
 fe25519_iszero(const fe25519 f)
 {
-    unsigned char s[32];
+  unsigned char s[32];

-    fe25519_tobytes(s, f);
+  fe25519_tobytes(s, f);

-    return sodium_is_zero(s, 32);
+  return sodium_is_zero(s, 32);
 }

 /*
@ -242,87 +242,87 @@ fe25519_iszero(const fe25519 f)
 static void
 fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
 {
-    const uint64_t mask = 0x7ffffffffffffULL;
-    uint128_t r0, r1, r2, r3, r4, carry;
-    uint64_t  f0, f1, f2, f3, f4;
-    uint64_t  f1_19, f2_19, f3_19, f4_19;
-    uint64_t  g0, g1, g2, g3, g4;
-    uint64_t  r00, r01, r02, r03, r04;
-
-    f0 = f[0];
-    f1 = f[1];
-    f2 = f[2];
-    f3 = f[3];
-    f4 = f[4];
-
-    g0 = g[0];
-    g1 = g[1];
-    g2 = g[2];
-    g3 = g[3];
-    g4 = g[4];
-
-    f1_19 = 19ULL * f1;
-    f2_19 = 19ULL * f2;
-    f3_19 = 19ULL * f3;
-    f4_19 = 19ULL * f4;
-
-    r0  = ((uint128_t) f0   ) * ((uint128_t) g0);
-    r0 += ((uint128_t) f1_19) * ((uint128_t) g4);
-    r0 += ((uint128_t) f2_19) * ((uint128_t) g3);
-    r0 += ((uint128_t) f3_19) * ((uint128_t) g2);
-    r0 += ((uint128_t) f4_19) * ((uint128_t) g1);
-
-    r1  = ((uint128_t) f0   ) * ((uint128_t) g1);
-    r1 += ((uint128_t) f1   ) * ((uint128_t) g0);
-    r1 += ((uint128_t) f2_19) * ((uint128_t) g4);
-    r1 += ((uint128_t) f3_19) * ((uint128_t) g3);
-    r1 += ((uint128_t) f4_19) * ((uint128_t) g2);
-
-    r2  = ((uint128_t) f0   ) * ((uint128_t) g2);
-    r2 += ((uint128_t) f1   ) * ((uint128_t) g1);
-    r2 += ((uint128_t) f2   ) * ((uint128_t) g0);
-    r2 += ((uint128_t) f3_19) * ((uint128_t) g4);
-    r2 += ((uint128_t) f4_19) * ((uint128_t) g3);
-
-    r3  = ((uint128_t) f0   ) * ((uint128_t) g3);
-    r3 += ((uint128_t) f1   ) * ((uint128_t) g2);
-    r3 += ((uint128_t) f2   ) * ((uint128_t) g1);
-    r3 += ((uint128_t) f3   ) * ((uint128_t) g0);
-    r3 += ((uint128_t) f4_19) * ((uint128_t) g4);
-
-    r4  = ((uint128_t) f0   ) * ((uint128_t) g4);
-    r4 += ((uint128_t) f1   ) * ((uint128_t) g3);
-    r4 += ((uint128_t) f2   ) * ((uint128_t) g2);
-    r4 += ((uint128_t) f3   ) * ((uint128_t) g1);
-    r4 += ((uint128_t) f4   ) * ((uint128_t) g0);
-
-    r00    = ((uint64_t) r0) & mask;
-    carry  = r0 >> 51;
-    r1    += carry;
-    r01    = ((uint64_t) r1) & mask;
-    carry  = r1 >> 51;
-    r2    += carry;
-    r02    = ((uint64_t) r2) & mask;
-    carry  = r2 >> 51;
-    r3    += carry;
-    r03    = ((uint64_t) r3) & mask;
-    carry  = r3 >> 51;
-    r4    += carry;
-    r04    = ((uint64_t) r4) & mask;
-    carry  = r4 >> 51;
-    r00   += 19ULL * (uint64_t) carry;
-    carry  = r00 >> 51;
-    r00   &= mask;
-    r01   += (uint64_t) carry;
-    carry  = r01 >> 51;
-    r01   &= mask;
-    r02   += (uint64_t) carry;
-
-    h[0] = r00;
-    h[1] = r01;
-    h[2] = r02;
-    h[3] = r03;
-    h[4] = r04;
+  const uint64_t mask = 0x7ffffffffffffULL;
+  uint128_t r0, r1, r2, r3, r4, carry;
+  uint64_t f0, f1, f2, f3, f4;
+  uint64_t f1_19, f2_19, f3_19, f4_19;
+  uint64_t g0, g1, g2, g3, g4;
+  uint64_t r00, r01, r02, r03, r04;
+
+  f0 = f[0];
+  f1 = f[1];
+  f2 = f[2];
+  f3 = f[3];
+  f4 = f[4];
+
+  g0 = g[0];
+  g1 = g[1];
+  g2 = g[2];
+  g3 = g[3];
+  g4 = g[4];
+
+  f1_19 = 19ULL * f1;
+  f2_19 = 19ULL * f2;
+  f3_19 = 19ULL * f3;
+  f4_19 = 19ULL * f4;
+
+  r0 = ((uint128_t)f0) * ((uint128_t)g0);
+  r0 += ((uint128_t)f1_19) * ((uint128_t)g4);
+  r0 += ((uint128_t)f2_19) * ((uint128_t)g3);
+  r0 += ((uint128_t)f3_19) * ((uint128_t)g2);
+  r0 += ((uint128_t)f4_19) * ((uint128_t)g1);
+
+  r1 = ((uint128_t)f0) * ((uint128_t)g1);
+  r1 += ((uint128_t)f1) * ((uint128_t)g0);
+  r1 += ((uint128_t)f2_19) * ((uint128_t)g4);
+  r1 += ((uint128_t)f3_19) * ((uint128_t)g3);
+  r1 += ((uint128_t)f4_19) * ((uint128_t)g2);
+
+  r2 = ((uint128_t)f0) * ((uint128_t)g2);
+  r2 += ((uint128_t)f1) * ((uint128_t)g1);
+  r2 += ((uint128_t)f2) * ((uint128_t)g0);
+  r2 += ((uint128_t)f3_19) * ((uint128_t)g4);
+  r2 += ((uint128_t)f4_19) * ((uint128_t)g3);
+
+  r3 = ((uint128_t)f0) * ((uint128_t)g3);
+  r3 += ((uint128_t)f1) * ((uint128_t)g2);
+  r3 += ((uint128_t)f2) * ((uint128_t)g1);
+  r3 += ((uint128_t)f3) * ((uint128_t)g0);
+  r3 += ((uint128_t)f4_19) * ((uint128_t)g4);
+
+  r4 = ((uint128_t)f0) * ((uint128_t)g4);
+  r4 += ((uint128_t)f1) * ((uint128_t)g3);
+  r4 += ((uint128_t)f2) * ((uint128_t)g2);
+  r4 += ((uint128_t)f3) * ((uint128_t)g1);
+  r4 += ((uint128_t)f4) * ((uint128_t)g0);
+
+  r00   = ((uint64_t)r0) & mask;
+  carry = r0 >> 51;
+  r1 += carry;
+  r01   = ((uint64_t)r1) & mask;
+  carry = r1 >> 51;
+  r2 += carry;
+  r02   = ((uint64_t)r2) & mask;
+  carry = r2 >> 51;
+  r3 += carry;
+  r03   = ((uint64_t)r3) & mask;
+  carry = r3 >> 51;
+  r4 += carry;
+  r04   = ((uint64_t)r4) & mask;
+  carry = r4 >> 51;
+  r00 += 19ULL * (uint64_t)carry;
+  carry = r00 >> 51;
+  r00 &= mask;
+  r01 += (uint64_t)carry;
+  carry = r01 >> 51;
+  r01 &= mask;
+  r02 += (uint64_t)carry;
+
+  h[0] = r00;
+  h[1] = r01;
+  h[2] = r02;
+  h[3] = r03;
+  h[4] = r04;
 }

 /*
@ -333,75 +333,75 @@ fe25519_mul(fe25519 h, const fe25519 f, const fe25519 g)
 static void
 fe25519_sq(fe25519 h, const fe25519 f)
 {
-    const uint64_t mask = 0x7ffffffffffffULL;
-    uint128_t r0, r1, r2, r3, r4, carry;
-    uint64_t  f0, f1, f2, f3, f4;
-    uint64_t  f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
-    uint64_t  r00, r01, r02, r03, r04;
-
-    f0 = f[0];
-    f1 = f[1];
-    f2 = f[2];
-    f3 = f[3];
-    f4 = f[4];
-
-    f0_2 = f0 << 1;
-    f1_2 = f1 << 1;
-
-    f1_38 = 38ULL * f1;
-    f2_38 = 38ULL * f2;
-    f3_38 = 38ULL * f3;
-
-    f3_19 = 19ULL * f3;
-    f4_19 = 19ULL * f4;
-
-    r0  = ((uint128_t) f0   ) * ((uint128_t) f0);
-    r0 += ((uint128_t) f1_38) * ((uint128_t) f4);
-    r0 += ((uint128_t) f2_38) * ((uint128_t) f3);
-
-    r1  = ((uint128_t) f0_2 ) * ((uint128_t) f1);
-    r1 += ((uint128_t) f2_38) * ((uint128_t) f4);
-    r1 += ((uint128_t) f3_19) * ((uint128_t) f3);
-
-    r2  = ((uint128_t) f0_2 ) * ((uint128_t) f2);
-    r2 += ((uint128_t) f1   ) * ((uint128_t) f1);
-    r2 += ((uint128_t) f3_38) * ((uint128_t) f4);
-
-    r3  = ((uint128_t) f0_2 ) * ((uint128_t) f3);
-    r3 += ((uint128_t) f1_2 ) * ((uint128_t) f2);
-    r3 += ((uint128_t) f4_19) * ((uint128_t) f4);
-
-    r4  = ((uint128_t) f0_2 ) * ((uint128_t) f4);
-    r4 += ((uint128_t) f1_2 ) * ((uint128_t) f3);
-    r4 += ((uint128_t) f2   ) * ((uint128_t) f2);
-
-    r00    = ((uint64_t) r0) & mask;
-    carry  = r0 >> 51;
-    r1    += carry;
-    r01    = ((uint64_t) r1) & mask;
-    carry  = r1 >> 51;
-    r2    += carry;
-    r02    = ((uint64_t) r2) & mask;
-    carry  = r2 >> 51;
-    r3    += carry;
-    r03    = ((uint64_t) r3) & mask;
-    carry  = r3 >> 51;
-    r4    += carry;
-    r04    = ((uint64_t) r4) & mask;
-    carry  = r4 >> 51;
-    r00   += 19ULL * (uint64_t) carry;
-    carry  = r00 >> 51;
-    r00   &= mask;
-    r01   += (uint64_t) carry;
-    carry  = r01 >> 51;
-    r01   &= mask;
-    r02   += (uint64_t) carry;
-
-    h[0] = r00;
-    h[1] = r01;
-    h[2] = r02;
-    h[3] = r03;
-    h[4] = r04;
+  const uint64_t mask = 0x7ffffffffffffULL;
+  uint128_t r0, r1, r2, r3, r4, carry;
+  uint64_t f0, f1, f2, f3, f4;
+  uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
+  uint64_t r00, r01, r02, r03, r04;
+
+  f0 = f[0];
+  f1 = f[1];
+  f2 = f[2];
+  f3 = f[3];
+  f4 = f[4];
+
+  f0_2 = f0 << 1;
+  f1_2 = f1 << 1;
+
+  f1_38 = 38ULL * f1;
+  f2_38 = 38ULL * f2;
+  f3_38 = 38ULL * f3;
+
+  f3_19 = 19ULL * f3;
+  f4_19 = 19ULL * f4;
+
+  r0 = ((uint128_t)f0) * ((uint128_t)f0);
+  r0 += ((uint128_t)f1_38) * ((uint128_t)f4);
+  r0 += ((uint128_t)f2_38) * ((uint128_t)f3);
+
+  r1 = ((uint128_t)f0_2) * ((uint128_t)f1);
+  r1 += ((uint128_t)f2_38) * ((uint128_t)f4);
+  r1 += ((uint128_t)f3_19) * ((uint128_t)f3);
+
+  r2 = ((uint128_t)f0_2) * ((uint128_t)f2);
+  r2 += ((uint128_t)f1) * ((uint128_t)f1);
+  r2 += ((uint128_t)f3_38) * ((uint128_t)f4);
+
+  r3 = ((uint128_t)f0_2) * ((uint128_t)f3);
+  r3 += ((uint128_t)f1_2) * ((uint128_t)f2);
+  r3 += ((uint128_t)f4_19) * ((uint128_t)f4);
+
+  r4 = ((uint128_t)f0_2) * ((uint128_t)f4);
+  r4 += ((uint128_t)f1_2) * ((uint128_t)f3);
+  r4 += ((uint128_t)f2) * ((uint128_t)f2);
+
+  r00   = ((uint64_t)r0) & mask;
+  carry = r0 >> 51;
+  r1 += carry;
+  r01   = ((uint64_t)r1) & mask;
+  carry = r1 >> 51;
+  r2 += carry;
+  r02   = ((uint64_t)r2) & mask;
+  carry = r2 >> 51;
+  r3 += carry;
+  r03   = ((uint64_t)r3) & mask;
+  carry = r3 >> 51;
+  r4 += carry;
+  r04   = ((uint64_t)r4) & mask;
+  carry = r4 >> 51;
+  r00 += 19ULL * (uint64_t)carry;
+  carry = r00 >> 51;
+  r00 &= mask;
+  r01 += (uint64_t)carry;
+  carry = r01 >> 51;
+  r01 &= mask;
+  r02 += (uint64_t)carry;
+
+  h[0] = r00;
+  h[1] = r01;
+  h[2] = r02;
+  h[3] = r03;
+  h[4] = r04;
 }

 /*
@ -412,107 +412,107 @@ fe25519_sq(fe25519 h, const fe25519 f)
 static void
 fe25519_sq2(fe25519 h, const fe25519 f)
 {
-    const uint64_t mask = 0x7ffffffffffffULL;
-    uint128_t r0, r1, r2, r3, r4, carry;
-    uint64_t  f0, f1, f2, f3, f4;
-    uint64_t  f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
-    uint64_t  r00, r01, r02, r03, r04;
-
-    f0 = f[0];
-    f1 = f[1];
-    f2 = f[2];
-    f3 = f[3];
-    f4 = f[4];
-
-    f0_2 = f0 << 1;
-    f1_2 = f1 << 1;
-
-    f1_38 = 38ULL * f1;
-    f2_38 = 38ULL * f2;
-    f3_38 = 38ULL * f3;
-
-    f3_19 = 19ULL * f3;
-    f4_19 = 19ULL * f4;
-
-    r0  = ((uint128_t) f0   ) * ((uint128_t) f0);
-    r0 += ((uint128_t) f1_38) * ((uint128_t) f4);
-    r0 += ((uint128_t) f2_38) * ((uint128_t) f3);
-
-    r1  = ((uint128_t) f0_2 ) * ((uint128_t) f1);
-    r1 += ((uint128_t) f2_38) * ((uint128_t) f4);
-    r1 += ((uint128_t) f3_19) * ((uint128_t) f3);
-
-    r2  = ((uint128_t) f0_2 ) * ((uint128_t) f2);
-    r2 += ((uint128_t) f1   ) * ((uint128_t) f1);
-    r2 += ((uint128_t) f3_38) * ((uint128_t) f4);
-
-    r3  = ((uint128_t) f0_2 ) * ((uint128_t) f3);
-    r3 += ((uint128_t) f1_2 ) * ((uint128_t) f2);
-    r3 += ((uint128_t) f4_19) * ((uint128_t) f4);
-
-    r4  = ((uint128_t) f0_2 ) * ((uint128_t) f4);
-    r4 += ((uint128_t) f1_2 ) * ((uint128_t) f3);
-    r4 += ((uint128_t) f2   ) * ((uint128_t) f2);
-
-    r0 <<= 1;
-    r1 <<= 1;
-    r2 <<= 1;
-    r3 <<= 1;
-    r4 <<= 1;
-
-    r00    = ((uint64_t) r0) & mask;
-    carry  = r0 >> 51;
-    r1    += carry;
-    r01    = ((uint64_t) r1) & mask;
-    carry  = r1 >> 51;
-    r2    += carry;
-    r02    = ((uint64_t) r2) & mask;
-    carry  = r2 >> 51;
-    r3    += carry;
-    r03    = ((uint64_t) r3) & mask;
-    carry  = r3 >> 51;
-    r4    += carry;
-    r04    = ((uint64_t) r4) & mask;
-    carry  = r4 >> 51;
-    r00   += 19ULL * (uint64_t) carry;
-    carry  = r00 >> 51;
-    r00   &= mask;
-    r01   += (uint64_t) carry;
-    carry  = r01 >> 51;
-    r01   &= mask;
-    r02   += (uint64_t) carry;
-
-    h[0] = r00;
-    h[1] = r01;
-    h[2] = r02;
-    h[3] = r03;
-    h[4] = r04;
+  const uint64_t mask = 0x7ffffffffffffULL;
+  uint128_t r0, r1, r2, r3, r4, carry;
+  uint64_t f0, f1, f2, f3, f4;
+  uint64_t f0_2, f1_2, f1_38, f2_38, f3_38, f3_19, f4_19;
+  uint64_t r00, r01, r02, r03, r04;
+
+  f0 = f[0];
+  f1 = f[1];
+  f2 = f[2];
+  f3 = f[3];
+  f4 = f[4];
+
+  f0_2 = f0 << 1;
+  f1_2 = f1 << 1;
+
+  f1_38 = 38ULL * f1;
+  f2_38 = 38ULL * f2;
+  f3_38 = 38ULL * f3;
+
+  f3_19 = 19ULL * f3;
+  f4_19 = 19ULL * f4;
+
+  r0 = ((uint128_t)f0) * ((uint128_t)f0);
+  r0 += ((uint128_t)f1_38) * ((uint128_t)f4);
+  r0 += ((uint128_t)f2_38) * ((uint128_t)f3);
+
+  r1 = ((uint128_t)f0_2) * ((uint128_t)f1);
+  r1 += ((uint128_t)f2_38) * ((uint128_t)f4);
+  r1 += ((uint128_t)f3_19) * ((uint128_t)f3);
+
+  r2 = ((uint128_t)f0_2) * ((uint128_t)f2);
+  r2 += ((uint128_t)f1) * ((uint128_t)f1);
+  r2 += ((uint128_t)f3_38) * ((uint128_t)f4);
+
+  r3 = ((uint128_t)f0_2) * ((uint128_t)f3);
+  r3 += ((uint128_t)f1_2) * ((uint128_t)f2);
+  r3 += ((uint128_t)f4_19) * ((uint128_t)f4);
+
+  r4 = ((uint128_t)f0_2) * ((uint128_t)f4);
+  r4 += ((uint128_t)f1_2) * ((uint128_t)f3);
+  r4 += ((uint128_t)f2) * ((uint128_t)f2);
+
+  r0 <<= 1;
+  r1 <<= 1;
+  r2 <<= 1;
+  r3 <<= 1;
+  r4 <<= 1;
+
+  r00   = ((uint64_t)r0) & mask;
+  carry = r0 >> 51;
+  r1 += carry;
+  r01   = ((uint64_t)r1) & mask;
+  carry = r1 >> 51;
+  r2 += carry;
+  r02   = ((uint64_t)r2) & mask;
+  carry = r2 >> 51;
+  r3 += carry;
+  r03   = ((uint64_t)r3) & mask;
+  carry = r3 >> 51;
+  r4 += carry;
+  r04   = ((uint64_t)r4) & mask;
+  carry = r4 >> 51;
+  r00 += 19ULL * (uint64_t)carry;
+  carry = r00 >> 51;
+  r00 &= mask;
+  r01 += (uint64_t)carry;
+  carry = r01 >> 51;
+  r01 &= mask;
+  r02 += (uint64_t)carry;
+
+  h[0] = r00;
+  h[1] = r01;
+  h[2] = r02;
+  h[3] = r03;
+  h[4] = r04;
 }

 static void
 fe25519_scalar_product(fe25519 h, const fe25519 f, uint32_t n)
 {
-    const uint64_t mask = 0x7ffffffffffffULL;
-    uint128_t a;
-    uint128_t sn = (uint128_t) n;
-    uint64_t  h0, h1, h2, h3, h4;
-
-    a  = f[0] * sn;
-    h0 = ((uint64_t) a) & mask;
-    a  = f[1] * sn + ((uint64_t) (a >> 51));
-    h1 = ((uint64_t) a) & mask;
-    a  = f[2] * sn + ((uint64_t) (a >> 51));
-    h2 = ((uint64_t) a) & mask;
-    a  = f[3] * sn + ((uint64_t) (a >> 51));
-    h3 = ((uint64_t) a) & mask;
-    a  = f[4] * sn + ((uint64_t) (a >> 51));
-    h4 = ((uint64_t) a) & mask;
-
-    h0 += (a >> 51) * 19ULL;
-
-    h[0] = h0;
-    h[1] = h1;
-    h[2] = h2;
-    h[3] = h3;
-    h[4] = h4;
+  const uint64_t mask = 0x7ffffffffffffULL;
+  uint128_t a;
+  uint128_t sn = (uint128_t)n;
+  uint64_t h0, h1, h2, h3, h4;
+
+  a  = f[0] * sn;
+  h0 = ((uint64_t)a) & mask;
+  a  = f[1] * sn + ((uint64_t)(a >> 51));
+  h1 = ((uint64_t)a) & mask;
+  a  = f[2] * sn + ((uint64_t)(a >> 51));
+  h2 = ((uint64_t)a) & mask;
+  a  = f[3] * sn + ((uint64_t)(a >> 51));
+  h3 = ((uint64_t)a) & mask;
+  a  = f[4] * sn + ((uint64_t)(a >> 51));
+  h4 = ((uint64_t)a) & mask;
+
+  h0 += (a >> 51) * 19ULL;
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
 }
--- a/crypto/include/sodium/private/implementations.h
+++ b/crypto/include/sodium/private/implementations.h
@ -1,11 +1,17 @@
 #ifndef implementations_H
 #define implementations_H

-int _crypto_generichash_blake2b_pick_best_implementation(void);
-int _crypto_onetimeauth_poly1305_pick_best_implementation(void);
-int _crypto_pwhash_argon2_pick_best_implementation(void);
-int _crypto_scalarmult_curve25519_pick_best_implementation(void);
-int _crypto_stream_chacha20_pick_best_implementation(void);
-int _crypto_stream_salsa20_pick_best_implementation(void);
+int
+_crypto_generichash_blake2b_pick_best_implementation(void);
+int
+_crypto_onetimeauth_poly1305_pick_best_implementation(void);
+int
+_crypto_pwhash_argon2_pick_best_implementation(void);
+int
+_crypto_scalarmult_curve25519_pick_best_implementation(void);
+int
+_crypto_stream_chacha20_pick_best_implementation(void);
+int
+_crypto_stream_salsa20_pick_best_implementation(void);

 #endif
--- a/crypto/include/sodium/private/mutex.h
+++ b/crypto/include/sodium/private/mutex.h
@ -1,7 +1,9 @@
 #ifndef mutex_H
 #define mutex_H 1

-extern int sodium_crit_enter(void);
-extern int sodium_crit_leave(void);
+extern int
+sodium_crit_enter(void);
+extern int
+sodium_crit_leave(void);

 #endif
--- a/crypto/include/sodium/private/sse2_64_32.h
+++ b/crypto/include/sodium/private/sse2_64_32.h
@ -4,46 +4,53 @@
 #include "common.h"

 #ifdef HAVE_INTRIN_H
-# include <intrin.h>
+#include <intrin.h>
 #endif

-#if defined(HAVE_EMMINTRIN_H) && \
-    !(defined(__amd64) || defined(__amd64__) || defined(__x86_64__) || \
-      defined(_M_X64) || defined(_M_AMD64))
+#if defined(HAVE_EMMINTRIN_H)                                          \
+    && !(defined(__amd64) || defined(__amd64__) || defined(__x86_64__) \
+         || defined(_M_X64) || defined(_M_AMD64))

-# include <emmintrin.h>
-# include <stdint.h>
+#include <emmintrin.h>
+#include <stdint.h>

-# ifndef _mm_set_epi64x
-#  define _mm_set_epi64x(Q0, Q1) sodium__mm_set_epi64x((Q0), (Q1))
+#ifndef _mm_set_epi64x
+#define _mm_set_epi64x(Q0, Q1) sodium__mm_set_epi64x((Q0), (Q1))
 static inline __m128i
 sodium__mm_set_epi64x(int64_t q1, int64_t q0)
 {
-    union { int64_t as64; int32_t as32[2]; } x0, x1;
-    x0.as64 = q0; x1.as64 = q1;
-    return _mm_set_epi32(x1.as32[1], x1.as32[0], x0.as32[1], x0.as32[0]);
+  union {
+    int64_t as64;
+    int32_t as32[2];
+  } x0, x1;
+  x0.as64 = q0;
+  x1.as64 = q1;
+  return _mm_set_epi32(x1.as32[1], x1.as32[0], x0.as32[1], x0.as32[0]);
 }
-# endif
+#endif

-# ifndef _mm_set1_epi64x
-#  define _mm_set1_epi64x(Q) sodium__mm_set1_epi64x(Q)
+#ifndef _mm_set1_epi64x
+#define _mm_set1_epi64x(Q) sodium__mm_set1_epi64x(Q)
 static inline __m128i
 sodium__mm_set1_epi64x(int64_t q)
 {
-    return _mm_set_epi64x(q, q);
+  return _mm_set_epi64x(q, q);
 }
-# endif
+#endif

-# ifndef _mm_cvtsi64_si128
-#  define _mm_cvtsi64_si128(Q) sodium__mm_cvtsi64_si128(Q)
+#ifndef _mm_cvtsi64_si128
+#define _mm_cvtsi64_si128(Q) sodium__mm_cvtsi64_si128(Q)
 static inline __m128i
 sodium__mm_cvtsi64_si128(int64_t q)
 {
-    union { int64_t as64; int32_t as32[2]; } x;
-    x.as64 = q;
-    return _mm_setr_epi32(x.as32[0], x.as32[1], 0, 0);
+  union {
+    int64_t as64;
+    int32_t as32[2];
+  } x;
+  x.as64 = q;
+  return _mm_setr_epi32(x.as32[0], x.as32[1], 0, 0);
 }
-# endif
+#endif

 #endif

--- a/crypto/include/sodium/randombytes_nativeclient.h
+++ b/crypto/include/sodium/randombytes_nativeclient.h
@ -4,19 +4,21 @@

 #ifdef __native_client__

-# include "export.h"
-# include "randombytes.h"
+#include "export.h"
+#include "randombytes.h"

-# ifdef __cplusplus
-extern "C" {
-# endif
+#ifdef __cplusplus
+extern "C"
+{
+#endif

-SODIUM_EXPORT
-extern struct randombytes_implementation randombytes_nativeclient_implementation;
+  SODIUM_EXPORT
+  extern struct randombytes_implementation
+      randombytes_nativeclient_implementation;

-# ifdef __cplusplus
+#ifdef __cplusplus
 }
-# endif
+#endif

 #endif

--- a/crypto/include/sodium/runtime.h
+++ b/crypto/include/sodium/runtime.h
@ -5,45 +5,59 @@
 #include "export.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_neon(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_neon(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_sse2(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_sse2(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_sse3(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_sse3(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_ssse3(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_ssse3(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_sse41(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_sse41(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_avx(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_avx(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_avx2(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_avx2(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_avx512f(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_avx512f(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_pclmul(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_pclmul(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_aesni(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_aesni(void);

-SODIUM_EXPORT_WEAK
-int sodium_runtime_has_rdrand(void);
+  SODIUM_EXPORT_WEAK
+  int
+  sodium_runtime_has_rdrand(void);

-/* ------------------------------------------------------------------------- */
+  /* -------------------------------------------------------------------------
+   */

-int _sodium_runtime_get_cpu_features(void);
+  int
+  _sodium_runtime_get_cpu_features(void);

 #ifdef __cplusplus
 }
--- a/crypto/include/sodium/utils.h
+++ b/crypto/include/sodium/utils.h
@ -7,161 +7,188 @@
 #include "export.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 #ifndef SODIUM_C99
-# if defined(__cplusplus) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
-#  define SODIUM_C99(X)
-# else
-#  define SODIUM_C99(X) X
-# endif
+#if defined(__cplusplus) || !defined(__STDC_VERSION__) \
+    || __STDC_VERSION__ < 199901L
+#define SODIUM_C99(X)
+#else
+#define SODIUM_C99(X) X
+#endif
 #endif

-SODIUM_EXPORT
-void sodium_memzero(void * const pnt, const size_t len);
-
-SODIUM_EXPORT
-void sodium_stackzero(const size_t len);
-
-/*
- * WARNING: sodium_memcmp() must be used to verify if two secret keys
- * are equal, in constant time.
- * It returns 0 if the keys are equal, and -1 if they differ.
- * This function is not designed for lexicographical comparisons.
- */
-SODIUM_EXPORT
-int sodium_memcmp(const void * const b1_, const void * const b2_, size_t len)
-            __attribute__ ((warn_unused_result));
-
-/*
- * sodium_compare() returns -1 if b1_ < b2_, 1 if b1_ > b2_ and 0 if b1_ == b2_
- * It is suitable for lexicographical comparisons, or to compare nonces
- * and counters stored in little-endian format.
- * However, it is slower than sodium_memcmp().
- */
-SODIUM_EXPORT
-int sodium_compare(const unsigned char *b1_, const unsigned char *b2_,
-                   size_t len)
-            __attribute__ ((warn_unused_result));
-
-SODIUM_EXPORT
-int sodium_is_zero(const unsigned char *n, const size_t nlen);
-
-SODIUM_EXPORT
-void sodium_increment(unsigned char *n, const size_t nlen);
-
-SODIUM_EXPORT
-void sodium_add(unsigned char *a, const unsigned char *b, const size_t len);
-
-SODIUM_EXPORT
-char *sodium_bin2hex(char * const hex, const size_t hex_maxlen,
-                     const unsigned char * const bin, const size_t bin_len);
-
-SODIUM_EXPORT
-int sodium_hex2bin(unsigned char * const bin, const size_t bin_maxlen,
-                   const char * const hex, const size_t hex_len,
-                   const char * const ignore, size_t * const bin_len,
-                   const char ** const hex_end);
-
-#define sodium_base64_VARIANT_ORIGINAL            1
+  SODIUM_EXPORT
+  void
+  sodium_memzero(void *const pnt, const size_t len);
+
+  SODIUM_EXPORT
+  void
+  sodium_stackzero(const size_t len);
+
+  /*
+   * WARNING: sodium_memcmp() must be used to verify if two secret keys
+   * are equal, in constant time.
+   * It returns 0 if the keys are equal, and -1 if they differ.
+   * This function is not designed for lexicographical comparisons.
+   */
+  SODIUM_EXPORT
+  int
+  sodium_memcmp(const void *const b1_, const void *const b2_, size_t len)
+      __attribute__((warn_unused_result));
+
+  /*
+   * sodium_compare() returns -1 if b1_ < b2_, 1 if b1_ > b2_ and 0 if b1_ ==
+   * b2_. It is suitable for lexicographical comparisons, or to compare nonces
+   * and counters stored in little-endian format.
+   * However, it is slower than sodium_memcmp().
+   */
+  SODIUM_EXPORT
+  int
+  sodium_compare(const unsigned char *b1_, const unsigned char *b2_, size_t len)
+      __attribute__((warn_unused_result));
+
+  SODIUM_EXPORT
+  int
+  sodium_is_zero(const unsigned char *n, const size_t nlen);
+
+  SODIUM_EXPORT
+  void
+  sodium_increment(unsigned char *n, const size_t nlen);
+
+  SODIUM_EXPORT
+  void
+  sodium_add(unsigned char *a, const unsigned char *b, const size_t len);
+
+  SODIUM_EXPORT
+  char *
+  sodium_bin2hex(char *const hex, const size_t hex_maxlen,
+                 const unsigned char *const bin, const size_t bin_len);
+
+  SODIUM_EXPORT
+  int
+  sodium_hex2bin(unsigned char *const bin, const size_t bin_maxlen,
+                 const char *const hex, const size_t hex_len,
+                 const char *const ignore, size_t *const bin_len,
+                 const char **const hex_end);
+
+#define sodium_base64_VARIANT_ORIGINAL 1
 #define sodium_base64_VARIANT_ORIGINAL_NO_PADDING 3
-#define sodium_base64_VARIANT_URLSAFE             5
-#define sodium_base64_VARIANT_URLSAFE_NO_PADDING  7
+#define sodium_base64_VARIANT_URLSAFE 5
+#define sodium_base64_VARIANT_URLSAFE_NO_PADDING 7

 /*
 * Computes the required length to encode BIN_LEN bytes as a base64 string
 * using the given variant. The computed length includes a trailing \0.
 */
-#define sodium_base64_ENCODED_LEN(BIN_LEN, VARIANT) \
-    (((BIN_LEN) / 3U) * 4U + \
-    ((((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) | (((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) >> 1)) & 1U) * \
-     (4U - (~((((VARIANT) & 2U) >> 1) - 1U) & (3U - ((BIN_LEN) - ((BIN_LEN) / 3U) * 3U)))) + 1U)
-
-SODIUM_EXPORT
-size_t sodium_base64_encoded_len(const size_t bin_len, const int variant);
-
-SODIUM_EXPORT
-char *sodium_bin2base64(char * const b64, const size_t b64_maxlen,
-                        const unsigned char * const bin, const size_t bin_len,
-                        const int variant);
-
-SODIUM_EXPORT
-int sodium_base642bin(unsigned char * const bin, const size_t bin_maxlen,
-                      const char * const b64, const size_t b64_len,
-                      const char * const ignore, size_t * const bin_len,
-                      const char ** const b64_end, const int variant);
-
-SODIUM_EXPORT
-int sodium_mlock(void * const addr, const size_t len);
-
-SODIUM_EXPORT
-int sodium_munlock(void * const addr, const size_t len);
-
-/* WARNING: sodium_malloc() and sodium_allocarray() are not general-purpose
- * allocation functions.
- *
- * They return a pointer to a region filled with 0xd0 bytes, immediately
- * followed by a guard page.
- * As a result, accessing a single byte after the requested allocation size
- * will intentionally trigger a segmentation fault.
- *
- * A canary and an additional guard page placed before the beginning of the
- * region may also kill the process if a buffer underflow is detected.
- *
- * The memory layout is:
- * [unprotected region size (read only)][guard page (no access)][unprotected pages (read/write)][guard page (no access)]
- * With the layout of the unprotected pages being:
- * [optional padding][16-bytes canary][user region]
- *
- * However:
- * - These functions are significantly slower than standard functions
- * - Each allocation requires 3 or 4 additional pages
- * - The returned address will not be aligned if the allocation size is not
- *   a multiple of the required alignment. For this reason, these functions
- *   are designed to store data, such as secret keys and messages.
- *
- * sodium_malloc() can be used to allocate any libsodium data structure.
- *
- * The crypto_generichash_state structure is packed and its length is
- * either 357 or 361 bytes. For this reason, when using sodium_malloc() to
- * allocate a crypto_generichash_state structure, padding must be added in
- * order to ensure proper alignment. crypto_generichash_statebytes()
- * returns the rounded up structure size, and should be prefered to sizeof():
- * state = sodium_malloc(crypto_generichash_statebytes());
- */
-
-SODIUM_EXPORT
-void *sodium_malloc(const size_t size)
-            __attribute__ ((malloc));
-
-SODIUM_EXPORT
-void *sodium_allocarray(size_t count, size_t size)
-            __attribute__ ((malloc));
-
-SODIUM_EXPORT
-void sodium_free(void *ptr);
-
-SODIUM_EXPORT
-int sodium_mprotect_noaccess(void *ptr);
-
-SODIUM_EXPORT
-int sodium_mprotect_readonly(void *ptr);
-
-SODIUM_EXPORT
-int sodium_mprotect_readwrite(void *ptr);
-
-SODIUM_EXPORT
-int sodium_pad(size_t *padded_buflen_p, unsigned char *buf,
-               size_t unpadded_buflen, size_t blocksize, size_t max_buflen);
-
-SODIUM_EXPORT
-int sodium_unpad(size_t *unpadded_buflen_p, const unsigned char *buf,
-                 size_t padded_buflen, size_t blocksize);
-
-/* -------- */
-
-int _sodium_alloc_init(void);
+#define sodium_base64_ENCODED_LEN(BIN_LEN, VARIANT)         \
+  (((BIN_LEN) / 3U) * 4U                                    \
+   + ((((BIN_LEN) - ((BIN_LEN) / 3U) * 3U)                  \
+       | (((BIN_LEN) - ((BIN_LEN) / 3U) * 3U) >> 1))        \
+      & 1U)                                                 \
+       * (4U                                                \
+          - (~((((VARIANT)&2U) >> 1) - 1U)                  \
+             & (3U - ((BIN_LEN) - ((BIN_LEN) / 3U) * 3U)))) \
+   + 1U)
+
+  SODIUM_EXPORT
+  size_t
+  sodium_base64_encoded_len(const size_t bin_len, const int variant);
+
+  SODIUM_EXPORT
+  char *
+  sodium_bin2base64(char *const b64, const size_t b64_maxlen,
+                    const unsigned char *const bin, const size_t bin_len,
+                    const int variant);
+
+  SODIUM_EXPORT
+  int
+  sodium_base642bin(unsigned char *const bin, const size_t bin_maxlen,
+                    const char *const b64, const size_t b64_len,
+                    const char *const ignore, size_t *const bin_len,
+                    const char **const b64_end, const int variant);
+
+  SODIUM_EXPORT
+  int
+  sodium_mlock(void *const addr, const size_t len);
+
+  SODIUM_EXPORT
+  int
+  sodium_munlock(void *const addr, const size_t len);
+
+  /* WARNING: sodium_malloc() and sodium_allocarray() are not general-purpose
+   * allocation functions.
+   *
+   * They return a pointer to a region filled with 0xd0 bytes, immediately
+   * followed by a guard page.
+   * As a result, accessing a single byte after the requested allocation size
+   * will intentionally trigger a segmentation fault.
+   *
+   * A canary and an additional guard page placed before the beginning of the
+   * region may also kill the process if a buffer underflow is detected.
+   *
+   * The memory layout is:
+   * [unprotected region size (read only)][guard page (no access)]
+   * [unprotected pages (read/write)][guard page (no access)]
+   * Unprotected pages layout: [optional padding][16-bytes canary][user region]
+   *
+   * However:
+   * - These functions are significantly slower than standard functions
+   * - Each allocation requires 3 or 4 additional pages
+   * - The returned address will not be aligned if the allocation size is not
+   *   a multiple of the required alignment. For this reason, these functions
+   *   are designed to store data, such as secret keys and messages.
+   *
+   * sodium_malloc() can be used to allocate any libsodium data structure.
+   *
+   * The crypto_generichash_state structure is packed and its length is
+   * either 357 or 361 bytes. For this reason, when using sodium_malloc() to
+   * allocate a crypto_generichash_state structure, padding must be added in
+   * order to ensure proper alignment. crypto_generichash_statebytes()
+   * returns the rounded up structure size, and should be preferred to sizeof():
+   * state = sodium_malloc(crypto_generichash_statebytes());
+   */
+
+  SODIUM_EXPORT
+  void *
+  sodium_malloc(const size_t size) __attribute__((malloc));
+
+  SODIUM_EXPORT
+  void *
+  sodium_allocarray(size_t count, size_t size) __attribute__((malloc));
+
+  SODIUM_EXPORT
+  void
+  sodium_free(void *ptr);
+
+  SODIUM_EXPORT
+  int
+  sodium_mprotect_noaccess(void *ptr);
+
+  SODIUM_EXPORT
+  int
+  sodium_mprotect_readonly(void *ptr);
+
+  SODIUM_EXPORT
+  int
+  sodium_mprotect_readwrite(void *ptr);
+
+  SODIUM_EXPORT
+  int
+  sodium_pad(size_t *padded_buflen_p, unsigned char *buf,
+             size_t unpadded_buflen, size_t blocksize, size_t max_buflen);
+
+  SODIUM_EXPORT
+  int
+  sodium_unpad(size_t *unpadded_buflen_p, const unsigned char *buf,
+               size_t padded_buflen, size_t blocksize);
+
+  /* -------- */
+
+  int
+  _sodium_alloc_init(void);

 #ifdef __cplusplus
 }
--- a/crypto/libntrup/src/avx/int32_sort.c
+++ b/crypto/libntrup/src/avx/int32_sort.c
@ -4,424 +4,463 @@

 typedef crypto_int32 int32;

-static inline void minmax(int32 *x,int32 *y)
+static inline void
+minmax(int32 *x, int32 *y)
 {
-  asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg %%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
-    : : "r"(x),"r"(y) : "%eax","%ebx","%edx");
+  asm("movl (%0),%%eax;movl (%1),%%ebx;cmpl %%ebx,%%eax;mov %%eax,%%edx;cmovg "
+      "%%ebx,%%eax;cmovg %%edx,%%ebx;movl %%eax,(%0);movl %%ebx,(%1)"
+      :
+      : "r"(x), "r"(y)
+      : "%eax", "%ebx", "%edx");
 }

 /* sort x0,x2; sort x1,x3; ... sort x13, x15 */
-static inline void minmax02through1315(int32 *x)
+static inline void
+minmax02through1315(int32 *x)
 {
-  __m256i a = _mm256_loadu_si256((__m256i *) x);
-  __m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
-  __m256i c = _mm256_unpacklo_epi64(a,b); /* a01b01a45b45 */
-  __m256i d = _mm256_unpackhi_epi64(a,b); /* a23b23a67b67 */
-  __m256i g = _mm256_min_epi32(c,d);
-  __m256i h = _mm256_max_epi32(c,d);
-  a = _mm256_unpacklo_epi64(g,h);
-  b = _mm256_unpackhi_epi64(g,h);
-  _mm256_storeu_si256((__m256i *) x,a);
-  _mm256_storeu_si256((__m256i *) (x + 8),b);
+  __m256i a = _mm256_loadu_si256((__m256i *)x);
+  __m256i b = _mm256_loadu_si256((__m256i *)(x + 8));
+  __m256i c = _mm256_unpacklo_epi64(a, b); /* a01b01a45b45 */
+  __m256i d = _mm256_unpackhi_epi64(a, b); /* a23b23a67b67 */
+  __m256i g = _mm256_min_epi32(c, d);
+  __m256i h = _mm256_max_epi32(c, d);
+  a         = _mm256_unpacklo_epi64(g, h);
+  b         = _mm256_unpackhi_epi64(g, h);
+  _mm256_storeu_si256((__m256i *)x, a);
+  _mm256_storeu_si256((__m256i *)(x + 8), b);
 }

 /* sort x0,x2; sort x1,x3; sort x4,x6; sort x5,x7 */
-static inline void minmax02134657(int32 *x)
+static inline void
+minmax02134657(int32 *x)
 {
-  __m256i a = _mm256_loadu_si256((__m256i *) x);
-  __m256i b = _mm256_shuffle_epi32(a,0x4e);
-  __m256i c = _mm256_cmpgt_epi32(a,b);
-  c = _mm256_shuffle_epi32(c,0x44);
+  __m256i a   = _mm256_loadu_si256((__m256i *)x);
+  __m256i b   = _mm256_shuffle_epi32(a, 0x4e);
+  __m256i c   = _mm256_cmpgt_epi32(a, b);
+  c           = _mm256_shuffle_epi32(c, 0x44);
  __m256i abc = c & (a ^ b);
  a ^= abc;
-  _mm256_storeu_si256((__m256i *) x,a);
+  _mm256_storeu_si256((__m256i *)x, a);
 }

-static void multiminmax2plus2(
-  int32 *x,
-  int n)
+static void
+multiminmax2plus2(int32 *x, int n)
 {
-  while (n >= 16) {
+  while(n >= 16)
+  {
    minmax02through1315(x);
    n -= 16;
    x += 16;
  }
-  if (n >= 8) {
+  if(n >= 8)
+  {
    minmax02134657(x);
    n -= 8;
    x += 8;
  }
-  if (n >= 4) {
-    minmax(x,x + 2);
-    minmax(x + 1,x + 3);
+  if(n >= 4)
+  {
+    minmax(x, x + 2);
+    minmax(x + 1, x + 3);
    n -= 4;
    x += 4;
  }
-  if (n > 0) {
-    minmax(x,x + 2);
-    if (n > 1) minmax(x + 1,x + 3);
+  if(n > 0)
+  {
+    minmax(x, x + 2);
+    if(n > 1)
+      minmax(x + 1, x + 3);
  }
 }

-static void multiminmax2plus6(
-  int32 *x,
-  int n)
+static void
+multiminmax2plus6(int32 *x, int n)
 {
-  while (n >= 4) {
-    minmax(x,x + 6);
-    minmax(x + 1,x + 7);
+  while(n >= 4)
+  {
+    minmax(x, x + 6);
+    minmax(x + 1, x + 7);
    n -= 4;
    x += 4;
  }
-  if (n > 0) {
-    minmax(x,x + 6);
-    if (n > 1) minmax(x + 1,x + 7);
+  if(n > 0)
+  {
+    minmax(x, x + 6);
+    if(n > 1)
+      minmax(x + 1, x + 7);
  }
 }

-static void multiminmax2plus14(
-  int32 *x,
-  int n)
+static void
+multiminmax2plus14(int32 *x, int n)
 {
-  while (n >= 8) {
-    minmax(x,x + 14);
-    minmax(x + 1,x + 15);
-    minmax(x + 4,x + 18);
-    minmax(x + 5,x + 19);
+  while(n >= 8)
+  {
+    minmax(x, x + 14);
+    minmax(x + 1, x + 15);
+    minmax(x + 4, x + 18);
+    minmax(x + 5, x + 19);
    n -= 8;
    x += 8;
  }
-  if (n >= 4) {
-    minmax(x,x + 14);
-    minmax(x + 1,x + 15);
+  if(n >= 4)
+  {
+    minmax(x, x + 14);
+    minmax(x + 1, x + 15);
    n -= 4;
    x += 4;
  }
-  if (n > 0) {
-    minmax(x,x + 14);
-    if (n > 1) minmax(x + 1,x + 15);
+  if(n > 0)
+  {
+    minmax(x, x + 14);
+    if(n > 1)
+      minmax(x + 1, x + 15);
  }
 }

 /* sort x[i],y[i] for i in 0,1,4,5,8,9,12,13 */
 /* all of x0...x15 and y0...y15 must exist; no aliasing */
-static inline void minmax0145891213(int32 *x,int32 *y)
+static inline void
+minmax0145891213(int32 *x, int32 *y)
 {
-  __m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
-  __m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
-  __m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
-  __m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
+  __m256i a01234567       = _mm256_loadu_si256((__m256i *)x);
+  __m256i a89101112131415 = _mm256_loadu_si256((__m256i *)(x + 8));
+  __m256i b01234567       = _mm256_loadu_si256((__m256i *)y);
+  __m256i b89101112131415 = _mm256_loadu_si256((__m256i *)(y + 8));

-  __m256i a0189451213 = _mm256_unpacklo_epi64(a01234567,a89101112131415);
-  __m256i b0189451213 = _mm256_unpacklo_epi64(b01234567,b89101112131415);
-  __m256i c0189451213 = _mm256_min_epi32(a0189451213,b0189451213);
-  __m256i d0189451213 = _mm256_max_epi32(a0189451213,b0189451213);
+  __m256i a0189451213 = _mm256_unpacklo_epi64(a01234567, a89101112131415);
+  __m256i b0189451213 = _mm256_unpacklo_epi64(b01234567, b89101112131415);
+  __m256i c0189451213 = _mm256_min_epi32(a0189451213, b0189451213);
+  __m256i d0189451213 = _mm256_max_epi32(a0189451213, b0189451213);

-  __m256i c01234567 = _mm256_blend_epi32(a01234567,c0189451213,0x33);
-  __m256i d01234567 = _mm256_blend_epi32(b01234567,d0189451213,0x33);
-  __m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213,a89101112131415);
-  __m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213,b89101112131415);
+  __m256i c01234567       = _mm256_blend_epi32(a01234567, c0189451213, 0x33);
+  __m256i d01234567       = _mm256_blend_epi32(b01234567, d0189451213, 0x33);
+  __m256i c89101112131415 = _mm256_unpackhi_epi64(c0189451213, a89101112131415);
+  __m256i d89101112131415 = _mm256_unpackhi_epi64(d0189451213, b89101112131415);

-  _mm256_storeu_si256((__m256i *) x,c01234567);
-  _mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
-  _mm256_storeu_si256((__m256i *) y,d01234567);
-  _mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
+  _mm256_storeu_si256((__m256i *)x, c01234567);
+  _mm256_storeu_si256((__m256i *)(x + 8), c89101112131415);
+  _mm256_storeu_si256((__m256i *)y, d01234567);
+  _mm256_storeu_si256((__m256i *)(y + 8), d89101112131415);
 }

 /* offset >= 30 */
-static void multiminmax2plusmore(
-  int32 *x,
-  int n,
-  int offset)
+static void
+multiminmax2plusmore(int32 *x, int n, int offset)
 {
-  while (n >= 16) {
-    minmax0145891213(x,x + offset);
+  while(n >= 16)
+  {
+    minmax0145891213(x, x + offset);
    n -= 16;
    x += 16;
  }
-  if (n >= 8) {
-    minmax(x,x + offset);
-    minmax(x + 1,x + 1 + offset);
-    minmax(x + 4,x + 4 + offset);
-    minmax(x + 5,x + 5 + offset);
+  if(n >= 8)
+  {
+    minmax(x, x + offset);
+    minmax(x + 1, x + 1 + offset);
+    minmax(x + 4, x + 4 + offset);
+    minmax(x + 5, x + 5 + offset);
    n -= 8;
    x += 8;
  }
-  if (n >= 4) {
-    minmax(x,x + offset);
-    minmax(x + 1,x + 1 + offset);
+  if(n >= 4)
+  {
+    minmax(x, x + offset);
+    minmax(x + 1, x + 1 + offset);
    n -= 4;
    x += 4;
  }
-  if (n > 0) {
-    minmax(x,x + offset);
-    if (n > 1) minmax(x + 1,x + 1 + offset);
+  if(n > 0)
+  {
+    minmax(x, x + offset);
+    if(n > 1)
+      minmax(x + 1, x + 1 + offset);
  }
 }

 /* sort x0,x1; ... sort x14, x15 */
-static inline void minmax01through1415(int32 *x)
+static inline void
+minmax01through1415(int32 *x)
 {
-  __m256i a = _mm256_loadu_si256((__m256i *) x);
-  __m256i b = _mm256_loadu_si256((__m256i *) (x + 8));
-  __m256i c = _mm256_unpacklo_epi32(a,b); /* ab0ab1ab4ab5 */
-  __m256i d = _mm256_unpackhi_epi32(a,b); /* ab2ab3ab6ab7 */
-  __m256i e = _mm256_unpacklo_epi32(c,d); /* a02b02a46b46 */
-  __m256i f = _mm256_unpackhi_epi32(c,d); /* a13b13a57b57 */
-  __m256i g = _mm256_min_epi32(e,f); /* a02b02a46b46 */
-  __m256i h = _mm256_max_epi32(e,f); /* a13b13a57b57 */
-  a = _mm256_unpacklo_epi32(g,h);
-  b = _mm256_unpackhi_epi32(g,h);
-  _mm256_storeu_si256((__m256i *) x,a);
-  _mm256_storeu_si256((__m256i *) (x + 8),b);
+  __m256i a = _mm256_loadu_si256((__m256i *)x);
+  __m256i b = _mm256_loadu_si256((__m256i *)(x + 8));
+  __m256i c = _mm256_unpacklo_epi32(a, b); /* ab0ab1ab4ab5 */
+  __m256i d = _mm256_unpackhi_epi32(a, b); /* ab2ab3ab6ab7 */
+  __m256i e = _mm256_unpacklo_epi32(c, d); /* a02b02a46b46 */
+  __m256i f = _mm256_unpackhi_epi32(c, d); /* a13b13a57b57 */
+  __m256i g = _mm256_min_epi32(e, f);      /* a02b02a46b46 */
+  __m256i h = _mm256_max_epi32(e, f);      /* a13b13a57b57 */
+  a         = _mm256_unpacklo_epi32(g, h);
+  b         = _mm256_unpackhi_epi32(g, h);
+  _mm256_storeu_si256((__m256i *)x, a);
+  _mm256_storeu_si256((__m256i *)(x + 8), b);
 }

 /* sort x0,x1; sort x2,x3; sort x4,x5; sort x6,x7 */
-static inline void minmax01234567(int32 *x)
+static inline void
+minmax01234567(int32 *x)
 {
-  __m256i a = _mm256_loadu_si256((__m256i *) x);
-  __m256i b = _mm256_shuffle_epi32(a,0xb1);
-  __m256i c = _mm256_cmpgt_epi32(a,b);
-  c = _mm256_shuffle_epi32(c,0xa0);
+  __m256i a   = _mm256_loadu_si256((__m256i *)x);
+  __m256i b   = _mm256_shuffle_epi32(a, 0xb1);
+  __m256i c   = _mm256_cmpgt_epi32(a, b);
+  c           = _mm256_shuffle_epi32(c, 0xa0);
  __m256i abc = c & (a ^ b);
  a ^= abc;
-  _mm256_storeu_si256((__m256i *) x,a);
+  _mm256_storeu_si256((__m256i *)x, a);
 }

-static void multiminmax1plus1(
-  int32 *x,
-  int n)
+static void
+multiminmax1plus1(int32 *x, int n)
 {
-  while (n >= 16) {
+  while(n >= 16)
+  {
    minmax01through1415(x);
    n -= 16;
    x += 16;
  }
-  if (n >= 8) {
+  if(n >= 8)
+  {
    minmax01234567(x);
    n -= 8;
    x += 8;
  }
-  if (n >= 4) {
-    minmax(x,x + 1);
-    minmax(x + 2,x + 3);
+  if(n >= 4)
+  {
+    minmax(x, x + 1);
+    minmax(x + 2, x + 3);
    n -= 4;
    x += 4;
  }
-  if (n >= 2) {
-    minmax(x,x + 1);
+  if(n >= 2)
+  {
+    minmax(x, x + 1);
    n -= 2;
    x += 2;
  }
-  if (n > 0)
-    minmax(x,x + 1);
+  if(n > 0)
+    minmax(x, x + 1);
 }

-static void multiminmax1(
-  int32 *x,
-  int n,
-  int offset)
+static void
+multiminmax1(int32 *x, int n, int offset)
 {
-  while (n >= 16) {
-    minmax(x,x + offset);
-    minmax(x + 2,x + 2 + offset);
-    minmax(x + 4,x + 4 + offset);
-    minmax(x + 6,x + 6 + offset);
-    minmax(x + 8,x + 8 + offset);
-    minmax(x + 10,x + 10 + offset);
-    minmax(x + 12,x + 12 + offset);
-    minmax(x + 14,x + 14 + offset);
+  while(n >= 16)
+  {
+    minmax(x, x + offset);
+    minmax(x + 2, x + 2 + offset);
+    minmax(x + 4, x + 4 + offset);
+    minmax(x + 6, x + 6 + offset);
+    minmax(x + 8, x + 8 + offset);
+    minmax(x + 10, x + 10 + offset);
+    minmax(x + 12, x + 12 + offset);
+    minmax(x + 14, x + 14 + offset);
    n -= 16;
    x += 16;
  }
-  if (n >= 8) {
-    minmax(x,x + offset);
-    minmax(x + 2,x + 2 + offset);
-    minmax(x + 4,x + 4 + offset);
-    minmax(x + 6,x + 6 + offset);
+  if(n >= 8)
+  {
+    minmax(x, x + offset);
+    minmax(x + 2, x + 2 + offset);
+    minmax(x + 4, x + 4 + offset);
+    minmax(x + 6, x + 6 + offset);
    n -= 8;
    x += 8;
  }
-  if (n >= 4) {
-    minmax(x,x + offset);
-    minmax(x + 2,x + 2 + offset);
+  if(n >= 4)
+  {
+    minmax(x, x + offset);
+    minmax(x + 2, x + 2 + offset);
    n -= 4;
    x += 4;
  }
-  if (n >= 2) {
-    minmax(x,x + offset);
+  if(n >= 2)
+  {
+    minmax(x, x + offset);
    n -= 2;
    x += 2;
  }
-  if (n > 0)
-    minmax(x,x + offset);
+  if(n > 0)
+    minmax(x, x + offset);
 }

 /* sort x[i],y[i] for i in 0,2,4,6,8,10,12,14 */
 /* all of x0...x15 and y0...y15 must exist; no aliasing */
-static inline void minmax02468101214(int32 *x,int32 *y)
+static inline void
+minmax02468101214(int32 *x, int32 *y)
 {
-  __m256i a01234567 = _mm256_loadu_si256((__m256i *) x);
-  __m256i a89101112131415 = _mm256_loadu_si256((__m256i *) (x + 8));
-  __m256i b01234567 = _mm256_loadu_si256((__m256i *) y);
-  __m256i b89101112131415 = _mm256_loadu_si256((__m256i *) (y + 8));
+  __m256i a01234567       = _mm256_loadu_si256((__m256i *)x);
+  __m256i a89101112131415 = _mm256_loadu_si256((__m256i *)(x + 8));
+  __m256i b01234567       = _mm256_loadu_si256((__m256i *)y);
+  __m256i b89101112131415 = _mm256_loadu_si256((__m256i *)(y + 8));

-  __m256i a0819412513 = _mm256_unpacklo_epi32(a01234567,a89101112131415);
-  __m256i a210311614715 = _mm256_unpackhi_epi32(a01234567,a89101112131415);
-  __m256i a02810461214 = _mm256_unpacklo_epi32(a0819412513,a210311614715);
-  __m256i a13911571315 = _mm256_unpackhi_epi32(a0819412513,a210311614715);
+  __m256i a0819412513   = _mm256_unpacklo_epi32(a01234567, a89101112131415);
+  __m256i a210311614715 = _mm256_unpackhi_epi32(a01234567, a89101112131415);
+  __m256i a02810461214  = _mm256_unpacklo_epi32(a0819412513, a210311614715);
+  __m256i a13911571315  = _mm256_unpackhi_epi32(a0819412513, a210311614715);

-  __m256i b0819412513 = _mm256_unpacklo_epi32(b01234567,b89101112131415);
-  __m256i b210311614715 = _mm256_unpackhi_epi32(b01234567,b89101112131415);
-  __m256i b02810461214 = _mm256_unpacklo_epi32(b0819412513,b210311614715);
-  __m256i b13911571315 = _mm256_unpackhi_epi32(b0819412513,b210311614715);
+  __m256i b0819412513   = _mm256_unpacklo_epi32(b01234567, b89101112131415);
+  __m256i b210311614715 = _mm256_unpackhi_epi32(b01234567, b89101112131415);
+  __m256i b02810461214  = _mm256_unpacklo_epi32(b0819412513, b210311614715);
+  __m256i b13911571315  = _mm256_unpackhi_epi32(b0819412513, b210311614715);

-  __m256i c02810461214 = _mm256_min_epi32(a02810461214,b02810461214);
-  __m256i d02810461214 = _mm256_max_epi32(a02810461214,b02810461214);
+  __m256i c02810461214 = _mm256_min_epi32(a02810461214, b02810461214);
+  __m256i d02810461214 = _mm256_max_epi32(a02810461214, b02810461214);

-  __m256i c01234567 = _mm256_unpacklo_epi32(c02810461214,a13911571315);
-  __m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214,a13911571315);
-  __m256i d01234567 = _mm256_unpacklo_epi32(d02810461214,b13911571315);
-  __m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214,b13911571315);
-  
-  _mm256_storeu_si256((__m256i *) x,c01234567);
-  _mm256_storeu_si256((__m256i *) (x + 8),c89101112131415);
-  _mm256_storeu_si256((__m256i *) y,d01234567);
-  _mm256_storeu_si256((__m256i *) (y + 8),d89101112131415);
+  __m256i c01234567       = _mm256_unpacklo_epi32(c02810461214, a13911571315);
+  __m256i c89101112131415 = _mm256_unpackhi_epi32(c02810461214, a13911571315);
+  __m256i d01234567       = _mm256_unpacklo_epi32(d02810461214, b13911571315);
+  __m256i d89101112131415 = _mm256_unpackhi_epi32(d02810461214, b13911571315);
+
+  _mm256_storeu_si256((__m256i *)x, c01234567);
+  _mm256_storeu_si256((__m256i *)(x + 8), c89101112131415);
+  _mm256_storeu_si256((__m256i *)y, d01234567);
+  _mm256_storeu_si256((__m256i *)(y + 8), d89101112131415);
 }

 /* assumes offset >= 31 */
-static void multiminmax1plusmore(
-  int32 *x,
-  int n,
-  int offset)
+static void
+multiminmax1plusmore(int32 *x, int n, int offset)
 {
-  while (n >= 16) {
-    minmax02468101214(x,x + offset);
+  while(n >= 16)
+  {
+    minmax02468101214(x, x + offset);
    n -= 16;
    x += 16;
  }
-  if (n >= 8) {
-    minmax(x,x + offset);
-    minmax(x + 2,x + 2 + offset);
-    minmax(x + 4,x + 4 + offset);
-    minmax(x + 6,x + 6 + offset);
+  if(n >= 8)
+  {
+    minmax(x, x + offset);
+    minmax(x + 2, x + 2 + offset);
+    minmax(x + 4, x + 4 + offset);
+    minmax(x + 6, x + 6 + offset);
    n -= 8;
    x += 8;
  }
-  if (n >= 4) {
-    minmax(x,x + offset);
-    minmax(x + 2,x + 2 + offset);
+  if(n >= 4)
+  {
+    minmax(x, x + offset);
+    minmax(x + 2, x + 2 + offset);
    n -= 4;
    x += 4;
  }
-  if (n >= 2) {
-    minmax(x,x + offset);
+  if(n >= 2)
+  {
+    minmax(x, x + offset);
    n -= 2;
    x += 2;
  }
-  if (n > 0)
-    minmax(x,x + offset);
+  if(n > 0)
+    minmax(x, x + offset);
 }

 /* sort x0,y0; sort x1,y1; ...; sort x7,y7 */
-static inline void minmax8(int32 *x,int32 *y)
+static inline void
+minmax8(int32 *x, int32 *y)
 {
-  __m256i a = _mm256_loadu_si256((__m256i *) x);
-  __m256i b = _mm256_loadu_si256((__m256i *) y);
-  _mm256_storeu_si256((__m256i *) x,_mm256_min_epi32(a,b));
-  _mm256_storeu_si256((__m256i *) y,_mm256_max_epi32(a,b));
+  __m256i a = _mm256_loadu_si256((__m256i *)x);
+  __m256i b = _mm256_loadu_si256((__m256i *)y);
+  _mm256_storeu_si256((__m256i *)x, _mm256_min_epi32(a, b));
+  _mm256_storeu_si256((__m256i *)y, _mm256_max_epi32(a, b));
 }

 /* assumes p >= 8; implies offset >= 8 */
-static void multiminmax_atleast8(int p,
-  int32 *x,
-  int n,
-  int offset)
+static void
+multiminmax_atleast8(int p, int32 *x, int n, int offset)
 {
  int i;
-  while (n >= 2 * p) {
-    for (i = 0;i < p;i += 8)
-      minmax8(x + i,x + i + offset);
+  while(n >= 2 * p)
+  {
+    for(i = 0; i < p; i += 8)
+      minmax8(x + i, x + i + offset);
    n -= 2 * p;
    x += 2 * p;
  }
-  for (i = 0;i + 8 <= n;i += 8) {
-    if (i & p) return;
-    minmax8(x + i,x + i + offset);
+  for(i = 0; i + 8 <= n; i += 8)
+  {
+    if(i & p)
+      return;
+    minmax8(x + i, x + i + offset);
  }
-  for (;i < n;++i) {
-    if (i & p) return;
-    minmax(x + i,x + i + offset);
+  for(; i < n; ++i)
+  {
+    if(i & p)
+      return;
+    minmax(x + i, x + i + offset);
  }
 }

 /* sort x0,y0; sort x1,y1; sort x2,y2; sort x3,y3 */
-static inline void minmax4(int32 *x,int32 *y)
+static inline void
+minmax4(int32 *x, int32 *y)
 {
-  __m128i a = _mm_loadu_si128((__m128i *) x);
-  __m128i b = _mm_loadu_si128((__m128i *) y);
-  _mm_storeu_si128((__m128i *) x,_mm_min_epi32(a,b));
-  _mm_storeu_si128((__m128i *) y,_mm_max_epi32(a,b));
+  __m128i a = _mm_loadu_si128((__m128i *)x);
+  __m128i b = _mm_loadu_si128((__m128i *)y);
+  _mm_storeu_si128((__m128i *)x, _mm_min_epi32(a, b));
+  _mm_storeu_si128((__m128i *)y, _mm_max_epi32(a, b));
 }

-static void multiminmax4(
-  int32 *x,
-  int n,
-  int offset)
+static void
+multiminmax4(int32 *x, int n, int offset)
 {
  int i;
-  while (n >= 8) {
-    minmax4(x,x + offset);
+  while(n >= 8)
+  {
+    minmax4(x, x + offset);
    n -= 8;
    x += 8;
  }
-  if (n >= 4)
-    minmax4(x,x + offset);
+  if(n >= 4)
+    minmax4(x, x + offset);
  else
-    for (i = 0;i < n;++i)
-      minmax(x + i,x + i + offset);
+    for(i = 0; i < n; ++i)
+      minmax(x + i, x + i + offset);
 }

-void int32_sort(int32 *x,int n)
+void
+int32_sort(int32 *x, int n)
 {
-  int top,p,q;
+  int top, p, q;

-  if (n < 2) return;
+  if(n < 2)
+    return;
  top = 1;
-  while (top < n - top) top += top;
+  while(top < n - top)
+    top += top;

-  for (p = top;p >= 8;p >>= 1) {
-    multiminmax_atleast8(p,x,n - p,p);
-    for (q = top;q > p;q >>= 1)
-      multiminmax_atleast8(p,x + p,n - q,q - p);
-  }
-  if (p >= 4) {
-    multiminmax4(x,n - 4,4);
-    for (q = top;q > 4;q >>= 1)
-      multiminmax4(x + 4,n - q,q - 4);
-  }
-  if (p >= 2) {
-    multiminmax2plus2(x,n - 2);
-    for (q = top;q >= 32;q >>= 1)
-      multiminmax2plusmore(x + 2,n - q,q - 2);
-    if (q >= 16)
-      multiminmax2plus14(x + 2,n - 16);
-    if (q >= 8)
-      multiminmax2plus6(x + 2,n - 8);
-    if (q >= 4)
-      multiminmax2plus2(x + 2,n - 4);
-  }
-  multiminmax1plus1(x,n - 1);
-  for (q = top;q >= 32;q >>= 1)
-    multiminmax1plusmore(x + 1,n - q,q - 1);
-  if (q >= 16)
-    multiminmax1(x + 1,n - 16,15);
-  if (q >= 8)
-    multiminmax1(x + 1,n - 8,7);
-  if (q >= 4)
-    multiminmax1(x + 1,n - 4,3);
-  if (q >= 2)
-    multiminmax1plus1(x + 1,n - 2);
+  for(p = top; p >= 8; p >>= 1)
+  {
+    multiminmax_atleast8(p, x, n - p, p);
+    for(q = top; q > p; q >>= 1)
+      multiminmax_atleast8(p, x + p, n - q, q - p);
+  }
+  if(p >= 4)
+  {
+    multiminmax4(x, n - 4, 4);
+    for(q = top; q > 4; q >>= 1)
+      multiminmax4(x + 4, n - q, q - 4);
+  }
+  if(p >= 2)
+  {
+    multiminmax2plus2(x, n - 2);
+    for(q = top; q >= 32; q >>= 1)
+      multiminmax2plusmore(x + 2, n - q, q - 2);
+    if(q >= 16)
+      multiminmax2plus14(x + 2, n - 16);
+    if(q >= 8)
+      multiminmax2plus6(x + 2, n - 8);
+    if(q >= 4)
+      multiminmax2plus2(x + 2, n - 4);
+  }
+  multiminmax1plus1(x, n - 1);
+  for(q = top; q >= 32; q >>= 1)
+    multiminmax1plusmore(x + 1, n - q, q - 1);
+  if(q >= 16)
+    multiminmax1(x + 1, n - 16, 15);
+  if(q >= 8)
+    multiminmax1(x + 1, n - 8, 7);
+  if(q >= 4)
+    multiminmax1(x + 1, n - 4, 3);
+  if(q >= 2)
+    multiminmax1plus1(x + 1, n - 2);
 }
 #endif
--- a/crypto/libntrup/src/avx/r3.h
+++ b/crypto/libntrup/src/avx/r3.h
@ -4,12 +4,15 @@
 #include "small.h"

 #define r3_mult crypto_kem_sntrup4591761_avx_r3_mult
-extern void r3_mult(small *,const small *,const small *);
+extern void
+r3_mult(small *, const small *, const small *);

 #define r3_recip crypto_kem_sntrup4591761_avx_r3_recip
-extern int r3_recip(small *,const small *);
+extern int
+r3_recip(small *, const small *);

 #define r3_weightw_mask crypto_kem_sntrup4591761_avx_r3_weightw_mask
-extern int r3_weightw_mask(const small *);
+extern int
+r3_weightw_mask(const small *);

 #endif
--- a/crypto/libntrup/src/avx/r3_recip.c
+++ b/crypto/libntrup/src/avx/r3_recip.c
@ -6,91 +6,102 @@
 #include "r3.h"

 /* caller must ensure that x-y does not overflow */
-static int smaller_mask(int x,int y)
+static int
+smaller_mask(int x, int y)
 {
  return (x - y) >> 31;
 }

-static void vectormod3_product(small *z,int len,const small *x,const small c)
+static void
+vectormod3_product(small *z, int len, const small *x, const small c)
 {
  int i;
  int minusmask = c;
-  int plusmask = -c;
+  int plusmask  = -c;
  __m256i minusvec, plusvec, zerovec;

  minusmask >>= 31;
  plusmask >>= 31;
  minusvec = _mm256_set1_epi32(minusmask);
-  plusvec = _mm256_set1_epi32(plusmask);
-  zerovec = _mm256_set1_epi32(0);
-
-  while (len >= 32) {
-    __m256i xi = _mm256_loadu_si256((__m256i *) x);
-    xi = (xi & plusvec) | (_mm256_sub_epi8(zerovec,xi) & minusvec);
-    _mm256_storeu_si256((__m256i *) z,xi);
+  plusvec  = _mm256_set1_epi32(plusmask);
+  zerovec  = _mm256_set1_epi32(0);
+
+  while(len >= 32)
+  {
+    __m256i xi = _mm256_loadu_si256((__m256i *)x);
+    xi         = (xi & plusvec) | (_mm256_sub_epi8(zerovec, xi) & minusvec);
+    _mm256_storeu_si256((__m256i *)z, xi);
    x += 32;
    z += 32;
    len -= 32;
  }

-  for (i = 0;i < len;++i) z[i] = mod3_product(x[i],c);
+  for(i = 0; i < len; ++i)
+    z[i] = mod3_product(x[i], c);
 }

-static void vectormod3_minusproduct(small *z,int len,const small *x,const small *y,const small c)
+static void
+vectormod3_minusproduct(small *z, int len, const small *x, const small *y,
+                        const small c)
 {
  int i;
  int minusmask = c;
-  int plusmask = -c;
+  int plusmask  = -c;
  __m256i minusvec, plusvec, zerovec, twovec, fourvec;

  minusmask >>= 31;
  plusmask >>= 31;
  minusvec = _mm256_set1_epi32(minusmask);
-  plusvec = _mm256_set1_epi32(plusmask);
-  zerovec = _mm256_set1_epi32(0);
-  twovec = _mm256_set1_epi32(0x02020202);
-  fourvec = _mm256_set1_epi32(0x04040404);
-
-  while (len >= 32) {
-    __m256i xi = _mm256_loadu_si256((__m256i *) x);
-    __m256i yi = _mm256_loadu_si256((__m256i *) y);
+  plusvec  = _mm256_set1_epi32(plusmask);
+  zerovec  = _mm256_set1_epi32(0);
+  twovec   = _mm256_set1_epi32(0x02020202);
+  fourvec  = _mm256_set1_epi32(0x04040404);
+
+  while(len >= 32)
+  {
+    __m256i xi = _mm256_loadu_si256((__m256i *)x);
+    __m256i yi = _mm256_loadu_si256((__m256i *)y);
    __m256i r;
-    yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec,yi) & minusvec);
-    xi = _mm256_sub_epi8(xi,yi);
+    yi = (yi & plusvec) | (_mm256_sub_epi8(zerovec, yi) & minusvec);
+    xi = _mm256_sub_epi8(xi, yi);

-    r = _mm256_add_epi8(xi,twovec);
+    r = _mm256_add_epi8(xi, twovec);
    r &= fourvec;
-    r = _mm256_srli_epi32(r,2);
-    xi = _mm256_sub_epi8(xi,r);
-    r = _mm256_add_epi8(r,r);
-    xi = _mm256_sub_epi8(xi,r);
+    r  = _mm256_srli_epi32(r, 2);
+    xi = _mm256_sub_epi8(xi, r);
+    r  = _mm256_add_epi8(r, r);
+    xi = _mm256_sub_epi8(xi, r);

-    r = _mm256_sub_epi8(twovec,xi);
+    r = _mm256_sub_epi8(twovec, xi);
    r &= fourvec;
-    r = _mm256_srli_epi32(r,2);
-    xi = _mm256_add_epi8(xi,r);
-    r = _mm256_add_epi8(r,r);
-    xi = _mm256_add_epi8(xi,r);
+    r  = _mm256_srli_epi32(r, 2);
+    xi = _mm256_add_epi8(xi, r);
+    r  = _mm256_add_epi8(r, r);
+    xi = _mm256_add_epi8(xi, r);

-    _mm256_storeu_si256((__m256i *) z,xi);
+    _mm256_storeu_si256((__m256i *)z, xi);
    x += 32;
    y += 32;
    z += 32;
    len -= 32;
  }

-  for (i = 0;i < len;++i) z[i] = mod3_minusproduct(x[i],y[i],c);
+  for(i = 0; i < len; ++i)
+    z[i] = mod3_minusproduct(x[i], y[i], c);
 }

-static void vectormod3_shift(small *z,int len)
+static void
+vectormod3_shift(small *z, int len)
 {
  int i;
-  while (len >= 33) {
-    __m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 33));
-    _mm256_storeu_si256((__m256i *) (z + len - 32),zi);
+  while(len >= 33)
+  {
+    __m256i zi = _mm256_loadu_si256((__m256i *)(z + len - 33));
+    _mm256_storeu_si256((__m256i *)(z + len - 32), zi);
    len -= 32;
  }
-  for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
+  for(i = len - 1; i > 0; --i)
+    z[i] = z[i - 1];
  z[0] = 0;
 }

@ -100,12 +111,13 @@ or returning -1 if s is not invertible mod m
 r,s are polys of degree <p
 m is x^p-x-1
 */
-int r3_recip(small *r,const small *s)
+int
+r3_recip(small *r, const small *s)
 {
-  const int loops = 2*p + 1;
+  const int loops = 2 * p + 1;
  int loop;
-  small f[768]; 
-  small g[769]; 
+  small f[768];
+  small g[769];
  small u[1536];
  small v[1537];
  small c;
@ -114,23 +126,28 @@ int r3_recip(small *r,const small *s)
  int e = p;
  int swapmask;

-  for (i = 2;i < p;++i) f[i] = 0;
+  for(i = 2; i < p; ++i)
+    f[i] = 0;
  f[0] = -1;
  f[1] = -1;
  f[p] = 1;
  /* generalization: can initialize f to any polynomial m */
  /* requirements: m has degree exactly p, nonzero constant coefficient */

-  for (i = 0;i < p;++i) g[i] = s[i];
+  for(i = 0; i < p; ++i)
+    g[i] = s[i];
  g[p] = 0;

-  for (i = 0;i <= loops;++i) u[i] = 0;
+  for(i = 0; i <= loops; ++i)
+    u[i] = 0;

  v[0] = 1;
-  for (i = 1;i <= loops;++i) v[i] = 0;
+  for(i = 1; i <= loops; ++i)
+    v[i] = 0;

  loop = 0;
-  for (;;) {
+  for(;;)
+  {
    /* e == -1 or d + e + loop <= 2*p */

    /* f has degree p: i.e., f[p]!=0 */
@ -141,29 +158,35 @@ int r3_recip(small *r,const small *s)

    /* u has degree <=loop (so it fits in loop+1 coefficients) */
    /* u[i]==0 for i < p-d */
-    /* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
+    /* if invertible: u[i]==0 for i < loop-p (so can look at just p+1
+     * coefficients) */

    /* v has degree <=loop (so it fits in loop+1 coefficients) */
    /* v[i]==0 for i < p-e */
    /* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */

-    if (loop >= loops) break;
+    if(loop >= loops)
+      break;

-    c = mod3_quotient(g[p],f[p]);
+    c = mod3_quotient(g[p], f[p]);

-    vectormod3_minusproduct(g,768,g,f,c);
-    vectormod3_shift(g,769);
+    vectormod3_minusproduct(g, 768, g, f, c);
+    vectormod3_shift(g, 769);

 #ifdef SIMPLER
-    vectormod3_minusproduct(v,1536,v,u,c);
-    vectormod3_shift(v,1537);
+    vectormod3_minusproduct(v, 1536, v, u, c);
+    vectormod3_shift(v, 1537);
 #else
-    if (loop < p) {
-      vectormod3_minusproduct(v,loop + 1,v,u,c);
-      vectormod3_shift(v,loop + 2);
-    } else {
-      vectormod3_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
-      vectormod3_shift(v + loop - p,p + 2);
+    if(loop < p)
+    {
+      vectormod3_minusproduct(v, loop + 1, v, u, c);
+      vectormod3_shift(v, loop + 2);
+    }
+    else
+    {
+      vectormod3_minusproduct(v + loop - p, p + 1, v + loop - p, u + loop - p,
+                              c);
+      vectormod3_shift(v + loop - p, p + 2);
    }
 #endif

@ -171,24 +194,28 @@ int r3_recip(small *r,const small *s)

    ++loop;

-    swapmask = smaller_mask(e,d) & mod3_nonzero_mask(g[p]);
-    swap(&e,&d,sizeof e,swapmask);
-    swap(f,g,(p + 1) * sizeof(small),swapmask);
+    swapmask = smaller_mask(e, d) & mod3_nonzero_mask(g[p]);
+    swap(&e, &d, sizeof e, swapmask);
+    swap(f, g, (p + 1) * sizeof(small), swapmask);

 #ifdef SIMPLER
-    swap(u,v,1536 * sizeof(small),swapmask);
+    swap(u, v, 1536 * sizeof(small), swapmask);
 #else
-    if (loop < p) {
-      swap(u,v,(loop + 1) * sizeof(small),swapmask);
-    } else {
-      swap(u + loop - p,v + loop - p,(p + 1) * sizeof(small),swapmask);
+    if(loop < p)
+    {
+      swap(u, v, (loop + 1) * sizeof(small), swapmask);
+    }
+    else
+    {
+      swap(u + loop - p, v + loop - p, (p + 1) * sizeof(small), swapmask);
    }
 #endif
  }

  c = mod3_reciprocal(f[p]);
-  vectormod3_product(r,p,u + p,c);
-  for (i = p;i < 768;++i) r[i] = 0;
-  return smaller_mask(0,d);
+  vectormod3_product(r, p, u + p, c);
+  for(i = p; i < 768; ++i)
+    r[i] = 0;
+  return smaller_mask(0, d);
 }
 #endif
--- a/crypto/libntrup/src/avx/rq.h
+++ b/crypto/libntrup/src/avx/rq.h
@ -5,27 +5,35 @@
 #include "small.h"

 #define rq_encode crypto_kem_sntrup4591761_avx_rq_encode
-extern void rq_encode(unsigned char *,const modq *);
+extern void
+rq_encode(unsigned char *, const modq *);

 #define rq_decode crypto_kem_sntrup4591761_avx_rq_decode
-extern void rq_decode(modq *,const unsigned char *);
+extern void
+rq_decode(modq *, const unsigned char *);

 #define rq_roundencode crypto_kem_sntrup4591761_avx_rq_roundencode
-extern void rq_roundencode(unsigned char *,const modq *);
+extern void
+rq_roundencode(unsigned char *, const modq *);

 #define rq_decoderounded crypto_kem_sntrup4591761_avx_rq_decoderounded
-extern void rq_decoderounded(modq *,const unsigned char *);
+extern void
+rq_decoderounded(modq *, const unsigned char *);

 #define rq_round3 crypto_kem_sntrup4591761_avx_rq_round3
-extern void rq_round3(modq *,const modq *);
+extern void
+rq_round3(modq *, const modq *);

 #define rq_mod3 crypto_kem_sntrup4591761_avx_rq_mod3
-extern void rq_mod3(small *,const modq *);
+extern void
+rq_mod3(small *, const modq *);

 #define rq_mult crypto_kem_sntrup4591761_avx_rq_mult
-extern void rq_mult(modq *,const modq *,const small *);
+extern void
+rq_mult(modq *, const modq *, const small *);

 #define rq_recip3 crypto_kem_sntrup4591761_avx_rq_recip3
-int rq_recip3(modq *,const small *);
+int
+rq_recip3(modq *, const small *);

 #endif
--- a/crypto/libntrup/src/avx/rq_mod3.c
+++ b/crypto/libntrup/src/avx/rq_mod3.c
@ -12,47 +12,57 @@

 // 32-bit hosts only: when __amd64__ is not defined, provide a local
 // _mm_extract_epi64(X, N) that views X as a __v2di vector and reads element
 // N directly (GNU statement expression, hence __extension__). Presumably the
 // real intrinsic is unavailable on such targets -- confirm.
 #ifndef __amd64__
-#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
-                                                  __a[N];}))
+#define _mm_extract_epi64(X, N) \
+  (__extension__({              \
+    __v2di __a = (__v2di)(X);   \
+    __a[N];                     \
+  }))
 #endif

-static inline __m256i squeeze(__m256i x)
 /* Partial reduction applied to every 16-bit lane of x:
    q = mulhrs(x, v7), then the result is x - q * v4591_16.
    v7 and v4591_16 are defined outside this hunk; presumably q approximates
    x / 4591 so the result is a small representative of x modulo 4591 --
    confirm against those macros. */
+static inline __m256i
+squeeze(__m256i x)
 {
-  __m256i q = _mm256_mulhrs_epi16(x,v7);
-  q = _mm256_mullo_epi16(q,v4591_16);
-  return _mm256_sub_epi16(x,q);
+  __m256i q = _mm256_mulhrs_epi16(x, v7);
+  q         = _mm256_mullo_epi16(q, v4591_16);
+  return _mm256_sub_epi16(x, q);
 }

-static inline __m256i freeze(__m256i x)
 /* Branch-free range correction on every 16-bit lane of x, in two steps:
    1. mask = sign of x (arithmetic shift by 15); lanes with x < 0 are
       replaced by x + v4591_16.
    2. mask = sign of (x - v2296_16); lanes where that difference is negative
       keep x, all other lanes become x - v4591_16.
    Presumably this maps an already-squeezed value into the centered range
    around zero modulo 4591 -- confirm against the macro values, which are
    defined outside this hunk. */
+static inline __m256i
+freeze(__m256i x)
 {
  __m256i mask, x2296, x4591;
-  x4591 = _mm256_add_epi16(x,v4591_16);
-  mask = _mm256_srai_epi16(x,15);
-  x = _mm256_blendv_epi8(x,x4591,mask);
-  x2296 = _mm256_sub_epi16(x,v2296_16);
-  mask = _mm256_srai_epi16(x2296,15);
-  x4591 = _mm256_sub_epi16(x,v4591_16);
-  x = _mm256_blendv_epi8(x4591,x,mask);
+  x4591 = _mm256_add_epi16(x, v4591_16);
+  mask  = _mm256_srai_epi16(x, 15);
+  x     = _mm256_blendv_epi8(x, x4591, mask);
+  x2296 = _mm256_sub_epi16(x, v2296_16);
+  mask  = _mm256_srai_epi16(x2296, 15);
+  x4591 = _mm256_sub_epi16(x, v4591_16);
+  x     = _mm256_blendv_epi8(x4591, x, mask);
  return x;
 }

-void rq_mod3(small *g,const modq *f)
 /* Converts the coefficients f[0..767] into small values g[0..767],
    16 coefficients (one 256-bit vector of 16-bit lanes) per iteration.
    Per lane: multiply by v3, squeeze(), freeze(), then with
    q = mulhrs(x, v10923_16) compute x - 3*q (q is subtracted once, then 2*q).
    Presumably q ~ x/3, so what remains is the residue mod 3 in {-1,0,1} --
    confirm. The 16-bit lanes are then packed to 8 bits with signed
    saturation and the low 64 bits of each 128-bit half are stored.
    Both arrays are touched at indices 0..767, so they must hold 768 entries.
    NOTE(review): the stores go through a (long long *) cast of &g[i]; this
    assumes small is one byte wide and that such an access is acceptable on
    the target -- confirm. N[ptr] below is the commuted spelling of ptr[N]. */
+void
+rq_mod3(small *g, const modq *f)
 {
  int i;

-  for (i = 0;i < 768;i += 16) {
-    __m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
+  for(i = 0; i < 768; i += 16)
+  {
+    __m256i x = _mm256_loadu_si256((__m256i *)&f[i]);
    __m256i q;
-    x = _mm256_mullo_epi16(x,v3);
+    x = _mm256_mullo_epi16(x, v3);
    x = squeeze(x);
    x = freeze(x);
-    q = _mm256_mulhrs_epi16(x,v10923_16);
-    x = _mm256_sub_epi16(x,q);
-    q = _mm256_add_epi16(q,q);
-    x = _mm256_sub_epi16(x,q); /* g0 g1 ... g15 */
-    x = _mm256_packs_epi16(x,x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
-    0[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,0),0);
-    1[(long long *) &g[i]] = _mm_extract_epi64(_mm256_extracti128_si256(x,1),0);
+    q = _mm256_mulhrs_epi16(x, v10923_16);
+    x = _mm256_sub_epi16(x, q);
+    q = _mm256_add_epi16(q, q);
+    x = _mm256_sub_epi16(x, q); /* g0 g1 ... g15 */
+    x = _mm256_packs_epi16(x,
+                           x); /* g0 ... g7 g0 ... g7 g8 ... g15 g8 ... g15 */
+    0 [(long long *)&g[i]] =
+        _mm_extract_epi64(_mm256_extracti128_si256(x, 0), 0);
+    1 [(long long *)&g[i]] =
+        _mm_extract_epi64(_mm256_extracti128_si256(x, 1), 0);
  }
 }
 #endif
--- a/crypto/libntrup/src/avx/rq_recip3.c
+++ b/crypto/libntrup/src/avx/rq_recip3.c
@ -10,93 +10,103 @@
 #define v29234_16 _mm256_set1_epi16(29234)

 /* Returns (x - y) >> 31 without branching. Assuming a 32-bit int and an
    arithmetic right shift of negative values, that is -1 (all bits set) when
    x < y and 0 otherwise, which makes it usable as a selection mask.
    caller must ensure that x-y does not overflow */
-static int smaller_mask(int x,int y)
+static int
+smaller_mask(int x, int y)
 {
  return (x - y) >> 31;
 }

-static inline __m256i product(__m256i x,__m256i y)
 /* Lane-wise product of the 16-bit lanes of x and y, followed by two
    reduction passes that subtract multiples of v4591_16:
    - lo/hi halves of x*y are interleaved into 32-bit products r0, r1;
    - pass 1: t = mulhrs(packs(r >> 16), v29234_16); r -= (t * v4591_16) << 4;
    - pass 2: t = mulhrs(packs(r >> 8), v1827_16);   r -= t * v4591_16;
    - the 32-bit results are packed back to 16-bit lanes with saturation.
    v1827_16 and v4591_16 are defined outside this hunk; presumably the
    result is a small representative of x*y modulo 4591 -- confirm. */
+static inline __m256i
+product(__m256i x, __m256i y)
 {
  __m256i lo, hi, r0, r1, t0, t1, t, s0, s1;

-  lo = _mm256_mullo_epi16(x,y);
-  hi = _mm256_mulhi_epi16(x,y);
-  r0 = _mm256_unpacklo_epi16(lo,hi);
-  r1 = _mm256_unpackhi_epi16(lo,hi);
-
-  t0 = _mm256_srai_epi32(r0,16);
-  t1 = _mm256_srai_epi32(r1,16);
-  t = _mm256_packs_epi32(t0,t1);
-  t = _mm256_mulhrs_epi16(t,v29234_16);
-  lo = _mm256_mullo_epi16(t,v4591_16);
-  hi = _mm256_mulhi_epi16(t,v4591_16);
-  s0 = _mm256_unpacklo_epi16(lo,hi);
-  s1 = _mm256_unpackhi_epi16(lo,hi);
-  s0 = _mm256_slli_epi32(s0,4);
-  s1 = _mm256_slli_epi32(s1,4);
-  r0 = _mm256_sub_epi32(r0,s0);
-  r1 = _mm256_sub_epi32(r1,s1);
-
-  t0 = _mm256_srai_epi32(r0,8);
-  t1 = _mm256_srai_epi32(r1,8);
-  t = _mm256_packs_epi32(t0,t1);
-  t = _mm256_mulhrs_epi16(t,v1827_16);
-  lo = _mm256_mullo_epi16(t,v4591_16);
-  hi = _mm256_mulhi_epi16(t,v4591_16);
-  s0 = _mm256_unpacklo_epi16(lo,hi);
-  s1 = _mm256_unpackhi_epi16(lo,hi);
-  r0 = _mm256_sub_epi32(r0,s0);
-  r1 = _mm256_sub_epi32(r1,s1);
-
-  x = _mm256_packs_epi32(r0,r1);
  /* widen: r0/r1 hold the full 32-bit products of the 16-bit lanes */
+  lo = _mm256_mullo_epi16(x, y);
+  hi = _mm256_mulhi_epi16(x, y);
+  r0 = _mm256_unpacklo_epi16(lo, hi);
+  r1 = _mm256_unpackhi_epi16(lo, hi);
+
  /* first reduction pass, driven by the top 16 bits of each product */
+  t0 = _mm256_srai_epi32(r0, 16);
+  t1 = _mm256_srai_epi32(r1, 16);
+  t  = _mm256_packs_epi32(t0, t1);
+  t  = _mm256_mulhrs_epi16(t, v29234_16);
+  lo = _mm256_mullo_epi16(t, v4591_16);
+  hi = _mm256_mulhi_epi16(t, v4591_16);
+  s0 = _mm256_unpacklo_epi16(lo, hi);
+  s1 = _mm256_unpackhi_epi16(lo, hi);
+  s0 = _mm256_slli_epi32(s0, 4);
+  s1 = _mm256_slli_epi32(s1, 4);
+  r0 = _mm256_sub_epi32(r0, s0);
+  r1 = _mm256_sub_epi32(r1, s1);
+
  /* second reduction pass, driven by r >> 8 of the partially reduced value */
+  t0 = _mm256_srai_epi32(r0, 8);
+  t1 = _mm256_srai_epi32(r1, 8);
+  t  = _mm256_packs_epi32(t0, t1);
+  t  = _mm256_mulhrs_epi16(t, v1827_16);
+  lo = _mm256_mullo_epi16(t, v4591_16);
+  hi = _mm256_mulhi_epi16(t, v4591_16);
+  s0 = _mm256_unpacklo_epi16(lo, hi);
+  s1 = _mm256_unpackhi_epi16(lo, hi);
+  r0 = _mm256_sub_epi32(r0, s0);
+  r1 = _mm256_sub_epi32(r1, s1);
+
  /* narrow back to 16-bit lanes */
+  x = _mm256_packs_epi32(r0, r1);
  return x;
 }

-static inline __m256i minusproduct(__m256i x,__m256i y,__m256i z)
 /* Lane-wise x - product(y, z), followed by one extra reduction step:
    t = mulhrs(x, v7); result = x - t * v4591_16 (the same three operations
    that squeeze() performs). v7 and v4591_16 are defined outside this hunk. */
+static inline __m256i
+minusproduct(__m256i x, __m256i y, __m256i z)
 {
  __m256i t;

-  x = _mm256_sub_epi16(x,product(y,z));
-  t = _mm256_mulhrs_epi16(x,v7);
-  t = _mm256_mullo_epi16(t,v4591_16);
-  x = _mm256_sub_epi16(x,t);
+  x = _mm256_sub_epi16(x, product(y, z));
+  t = _mm256_mulhrs_epi16(x, v7);
+  t = _mm256_mullo_epi16(t, v4591_16);
+  x = _mm256_sub_epi16(x, t);
  return x;
 }

-static void vectormodq_product(modq *z,int len,const modq *x,const modq c)
 /* Writes z[i] = x[i] * c (reduced by product()/modq_product()) for
    i in [0, len). The constant c is broadcast to all 16-bit lanes; the first
    loop handles 16 elements at a time with unaligned loads/stores, the second
    loop finishes the remaining len % 16 elements one by one via
    modq_product(), which is defined elsewhere. */
+static void
+vectormodq_product(modq *z, int len, const modq *x, const modq c)
 {
  __m256i cvec = _mm256_set1_epi16(c);
-  while (len >= 16) {
-    __m256i xi = _mm256_loadu_si256((__m256i *) x);
-    xi = product(xi,cvec);
-    _mm256_storeu_si256((__m256i *) z,xi);
+  while(len >= 16)
+  {
+    __m256i xi = _mm256_loadu_si256((__m256i *)x);
+    xi         = product(xi, cvec);
+    _mm256_storeu_si256((__m256i *)z, xi);
    x += 16;
    z += 16;
    len -= 16;
  }
-  while (len > 0) {
-    *z = modq_product(*x,c);
+  while(len > 0)
+  {
+    *z = modq_product(*x, c);
    ++x;
    ++z;
    --len;
  }
 }

-static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,const modq c)
+static void
+vectormodq_minusproduct(modq *z, int len, const modq *x, const modq *y,
+                        const modq c)
 {
  __m256i cvec = _mm256_set1_epi16(c);
-  while (len >= 16) {
-    __m256i xi = _mm256_loadu_si256((__m256i *) x);
-    __m256i yi = _mm256_loadu_si256((__m256i *) y);
-    xi = minusproduct(xi,yi,cvec);
-    _mm256_storeu_si256((__m256i *) z,xi);
+  while(len >= 16)
+  {
+    __m256i xi = _mm256_loadu_si256((__m256i *)x);
+    __m256i yi = _mm256_loadu_si256((__m256i *)y);
+    xi         = minusproduct(xi, yi, cvec);
+    _mm256_storeu_si256((__m256i *)z, xi);
    x += 16;
    y += 16;
    z += 16;
    len -= 16;
  }
-  while (len > 0) {
-    *z = modq_minusproduct(*x,*y,c);
+  while(len > 0)
+  {
+    *z = modq_minusproduct(*x, *y, c);
    ++x;
    ++y;
    ++z;
@ -104,15 +114,18 @@ static void vectormodq_minusproduct(modq *z,int len,const modq *x,const modq *y,
  }
 }

-static void vectormodq_shift(modq *z,int len)
 /* Shifts z[0..len-1] up by one position in place: the new z[i] is the old
    z[i-1] and z[0] becomes 0 (the old z[len-1] is dropped).
    The vector loop starts at the high end of the array and moves downward,
    loading 16 elements at z+len-17 and storing them one slot higher, so every
    element is read before it is overwritten. The scalar loop then handles the
    remaining low elements. */
+static void
+vectormodq_shift(modq *z, int len)
 {
  int i;
-  while (len >= 17) {
-    __m256i zi = _mm256_loadu_si256((__m256i *) (z + len - 17));
-    _mm256_storeu_si256((__m256i *) (z + len - 16),zi);
+  while(len >= 17)
+  {
+    __m256i zi = _mm256_loadu_si256((__m256i *)(z + len - 17));
+    _mm256_storeu_si256((__m256i *)(z + len - 16), zi);
    len -= 16;
  }
-  for (i = len - 1;i > 0;--i) z[i] = z[i - 1];
+  for(i = len - 1; i > 0; --i)
+    z[i] = z[i - 1];
  z[0] = 0;
 }

@ -122,9 +135,10 @@ or returning -1 if s is not invertible mod m
 r,s are polys of degree <p
 m is x^p-x-1
 */
-int rq_recip3(modq *r,const small *s)
+int
+rq_recip3(modq *r, const small *s)
 {
-  const int loops = 2*p + 1;
+  const int loops = 2 * p + 1;
  int loop;
  modq f[768];
  modq g[769];
@ -136,23 +150,28 @@ int rq_recip3(modq *r,const small *s)
  int e = p;
  int swapmask;

-  for (i = 2;i < p;++i) f[i] = 0;
+  for(i = 2; i < p; ++i)
+    f[i] = 0;
  f[0] = -1;
  f[1] = -1;
  f[p] = 1;
  /* generalization: can initialize f to any polynomial m */
  /* requirements: m has degree exactly p, nonzero constant coefficient */

-  for (i = 0;i < p;++i) g[i] = 3 * s[i];
+  for(i = 0; i < p; ++i)
+    g[i] = 3 * s[i];
  g[p] = 0;

-  for (i = 0;i <= loops;++i) u[i] = 0;
+  for(i = 0; i <= loops; ++i)
+    u[i] = 0;

  v[0] = 1;
-  for (i = 1;i <= loops;++i) v[i] = 0;
+  for(i = 1; i <= loops; ++i)
+    v[i] = 0;

  loop = 0;
-  for (;;) {
+  for(;;)
+  {
    /* e == -1 or d + e + loop <= 2*p */

    /* f has degree p: i.e., f[p]!=0 */
@ -163,29 +182,35 @@ int rq_recip3(modq *r,const small *s)

    /* u has degree <=loop (so it fits in loop+1 coefficients) */
    /* u[i]==0 for i < p-d */
-    /* if invertible: u[i]==0 for i < loop-p (so can look at just p+1 coefficients) */
+    /* if invertible: u[i]==0 for i < loop-p (so can look at just p+1
+     * coefficients) */

    /* v has degree <=loop (so it fits in loop+1 coefficients) */
    /* v[i]==0 for i < p-e */
    /* v[i]==0 for i < loop-p (so can look at just p+1 coefficients) */

-    if (loop >= loops) break;
+    if(loop >= loops)
+      break;

-    c = modq_quotient(g[p],f[p]);
+    c = modq_quotient(g[p], f[p]);

-    vectormodq_minusproduct(g,768,g,f,c);
-    vectormodq_shift(g,769);
+    vectormodq_minusproduct(g, 768, g, f, c);
+    vectormodq_shift(g, 769);

 #ifdef SIMPLER
-    vectormodq_minusproduct(v,1536,v,u,c);
-    vectormodq_shift(v,1537);
+    vectormodq_minusproduct(v, 1536, v, u, c);
+    vectormodq_shift(v, 1537);
 #else
-    if (loop < p) {
-      vectormodq_minusproduct(v,loop + 1,v,u,c);
-      vectormodq_shift(v,loop + 2);
-    } else {
-      vectormodq_minusproduct(v + loop - p,p + 1,v + loop - p,u + loop - p,c);
-      vectormodq_shift(v + loop - p,p + 2);
+    if(loop < p)
+    {
+      vectormodq_minusproduct(v, loop + 1, v, u, c);
+      vectormodq_shift(v, loop + 2);
+    }
+    else
+    {
+      vectormodq_minusproduct(v + loop - p, p + 1, v + loop - p, u + loop - p,
+                              c);
+      vectormodq_shift(v + loop - p, p + 2);
    }
 #endif

@ -193,25 +218,30 @@ int rq_recip3(modq *r,const small *s)

    ++loop;

-    swapmask = smaller_mask(e,d) & modq_nonzero_mask(g[p]);
-    swap(&e,&d,sizeof e,swapmask);
-    swap(f,g,768 * sizeof(modq),swapmask);
+    swapmask = smaller_mask(e, d) & modq_nonzero_mask(g[p]);
+    swap(&e, &d, sizeof e, swapmask);
+    swap(f, g, 768 * sizeof(modq), swapmask);

 #ifdef SIMPLER
-    swap(u,v,1536 * sizeof(modq),swapmask);
+    swap(u, v, 1536 * sizeof(modq), swapmask);
 #else
-    if (loop < p) {
-      swap(u,v,(loop + 1) * sizeof(modq),swapmask);
-    } else {
-      swap(u + loop - p,v + loop - p,(p + 1) * sizeof(modq),swapmask);
+    if(loop < p)
+    {
+      swap(u, v, (loop + 1) * sizeof(modq), swapmask);
+    }
+    else
+    {
+      swap(u + loop - p, v + loop - p, (p + 1) * sizeof(modq), swapmask);
    }
 #endif
  }

  c = modq_reciprocal(f[p]);
-  vectormodq_product(r,p,u + p,c);
-  for (i = 0;i < p;++i) r[i] = modq_freeze(r[i]);
-  for (i = p;i < 768;++i) r[i] = 0;
-  return smaller_mask(0,d);
+  vectormodq_product(r, p, u + p, c);
+  for(i = 0; i < p; ++i)
+    r[i] = modq_freeze(r[i]);
+  for(i = p; i < 768; ++i)
+    r[i] = 0;
+  return smaller_mask(0, d);
 }
 #endif
--- a/crypto/libntrup/src/avx/rq_round3.c
+++ b/crypto/libntrup/src/avx/rq_round3.c
@ -6,17 +6,19 @@
 #define v3_16 _mm256_set1_epi16(3)
 #define v10923_16 _mm256_set1_epi16(10923)

-void rq_round3(modq *h,const modq *f)
 /* Rounds each of the 768 coefficients of f to a multiple of 3 and stores
    the result in h, 16 coefficients (16-bit lanes) per iteration.
    Per lane: x = mulhrs(f, 10923), where 10923 is about 2^15 / 3, so x is
    f / 3 rounded; the result is then x + 2*x = 3*x.
    Both arrays are accessed at indices 0..767. */
+void
+rq_round3(modq *h, const modq *f)
 {
  int i;

-  for (i = 0;i < 768;i += 16) {
-    __m256i x = _mm256_loadu_si256((__m256i *) &f[i]);
+  for(i = 0; i < 768; i += 16)
+  {
+    __m256i x = _mm256_loadu_si256((__m256i *)&f[i]);
    __m256i x2;
-    x = _mm256_mulhrs_epi16(x,v10923_16);
-    x2 = _mm256_add_epi16(x,x);
-    x = _mm256_add_epi16(x,x2);
-    _mm256_storeu_si256((__m256i *) &h[i],x);
+    x  = _mm256_mulhrs_epi16(x, v10923_16);
+    x2 = _mm256_add_epi16(x, x);
+    x  = _mm256_add_epi16(x, x2);
+    _mm256_storeu_si256((__m256i *)&h[i], x);
  }
 }
 #endif
--- a/crypto/libntrup/src/avx/rq_rounded.c
+++ b/crypto/libntrup/src/avx/rq_rounded.c
@ -164,35 +164,40 @@ rq_decoderounded(modq *f, const unsigned char *c)
      /* x is f0 + f1*1536 + f2*1536^2 */
      /* with each f between 0 and 1530 */

-      f2 = x
+      f2 =
+          x
          * _mm256_set1_pd(
-                0.00000042385525173611114052197733521876177320564238470979034900665283203125);
+              0.00000042385525173611114052197733521876177320564238470979034900665283203125);
      f2 = floor(f2);
      x -= f2 * _mm256_set1_pd(2359296.0);

-      f1 = x
+      f1 =
+          x
          * _mm256_set1_pd(
-                0.00065104166666666673894681149903362893383018672466278076171875);
+              0.00065104166666666673894681149903362893383018672466278076171875);
      f1 = floor(f1);
      x -= f1 * _mm256_set1_pd(1536.0);

      f0 = x;

-      f2 -= _mm256_set1_pd(1531.0)
+      f2 -=
+          _mm256_set1_pd(1531.0)
          * floor(
-                f2
-                * _mm256_set1_pd(
-                      0.0006531678641410842804659875326933615724556148052215576171875));
-      f1 -= _mm256_set1_pd(1531.0)
+              f2
+              * _mm256_set1_pd(
+                  0.0006531678641410842804659875326933615724556148052215576171875));
+      f1 -=
+          _mm256_set1_pd(1531.0)
          * floor(
-                f1
-                * _mm256_set1_pd(
-                      0.0006531678641410842804659875326933615724556148052215576171875));
-      f0 -= _mm256_set1_pd(1531.0)
+              f1
+              * _mm256_set1_pd(
+                  0.0006531678641410842804659875326933615724556148052215576171875));
+      f0 -=
+          _mm256_set1_pd(1531.0)
          * floor(
-                f0
-                * _mm256_set1_pd(
-                      0.0006531678641410842804659875326933615724556148052215576171875));
+              f0
+              * _mm256_set1_pd(
+                  0.0006531678641410842804659875326933615724556148052215576171875));

      f2 *= _mm256_set1_pd(3.0);
      f2 -= _mm256_set1_pd(2295.0);
--- a/crypto/libntrup/src/avx/swap.c
+++ b/crypto/libntrup/src/avx/swap.c
@ -2,30 +2,33 @@
 #include <immintrin.h>
 #include "swap.h"

-void swap(void *x,void *y,int bytes,int mask)
+void
+swap(void *x, void *y, int bytes, int mask)
 {
-  char c = mask;
+  char c          = mask;
  __m256i maskvec = _mm256_set1_epi32(mask);
-  
-  while (bytes >= 32) {
-    __m256i xi = _mm256_loadu_si256(x);
-    __m256i yi = _mm256_loadu_si256(y);
-    __m256i xinew = _mm256_blendv_epi8(xi,yi,maskvec);
-    __m256i yinew = _mm256_blendv_epi8(yi,xi,maskvec);
-    _mm256_storeu_si256(x,xinew);
-    _mm256_storeu_si256(y,yinew);
-    x = 32 + (char *) x;
-    y = 32 + (char *) y;
+
+  while(bytes >= 32)
+  {
+    __m256i xi    = _mm256_loadu_si256(x);
+    __m256i yi    = _mm256_loadu_si256(y);
+    __m256i xinew = _mm256_blendv_epi8(xi, yi, maskvec);
+    __m256i yinew = _mm256_blendv_epi8(yi, xi, maskvec);
+    _mm256_storeu_si256(x, xinew);
+    _mm256_storeu_si256(y, yinew);
+    x = 32 + (char *)x;
+    y = 32 + (char *)y;
    bytes -= 32;
  }
-  while (bytes > 0) {
-    char xi = *(char *) x;
-    char yi = *(char *) y;
-    char t = c & (xi ^ yi);
+  while(bytes > 0)
+  {
+    char xi = *(char *)x;
+    char yi = *(char *)y;
+    char t  = c & (xi ^ yi);
    xi ^= t;
    yi ^= t;
-    *(char *) x = xi;
-    *(char *) y = yi;
+    *(char *)x = xi;
+    *(char *)y = yi;
    ++x;
    ++y;
    --bytes;
--- a/crypto/libntrup/src/avx/swap.h
+++ b/crypto/libntrup/src/avx/swap.h
@ -2,6 +2,7 @@
 #define swap_h

 #define swap crypto_kem_sntrup4591761_avx_swap
-extern void swap(void *,void *,int,int);
+extern void
+swap(void *, void *, int, int);

 #endif
--- a/crypto/libntrup/src/ntru.cpp
+++ b/crypto/libntrup/src/ntru.cpp
@ -1,7 +1,7 @@
 #include <libntrup/ntru.h>
 #include <stdbool.h>

-#include <stdio.h> // printf
+#include <stdio.h>  // printf

 #if __AVX2__
 #include <cpuid.h>
--- a/crypto/libntrup/src/ref/params.h
+++ b/crypto/libntrup/src/ref/params.h
@ -7,7 +7,7 @@
 #define qshift 2295
 #define p 761
 /* LOOPS (2*p + 1) is only defined for MSVC builds.
    NOTE(review): the expansion is not parenthesized, so it is only safe where
    it is not combined with higher-precedence operators; for example
    LOOPS * 2 would expand to 2 * p + 1 * 2. */
 #ifdef _MSC_VER
-#define LOOPS 2*p+1
+#define LOOPS 2 * p + 1
 #endif
 #define w 286

--- a/crypto/libntrup/src/ref/r3.h
+++ b/crypto/libntrup/src/ref/r3.h
@ -4,9 +4,11 @@
 #include "small.h"

 #define r3_mult crypto_kem_sntrup4591761_ref_r3_mult
-extern void r3_mult(small *,const small *,const small *);
+extern void
+r3_mult(small *, const small *, const small *);

 #define r3_recip crypto_kem_sntrup4591761_ref_r3_recip
-extern int r3_recip(small *,const small *);
+extern int
+r3_recip(small *, const small *);

 #endif
--- a/crypto/libntrup/src/ref/r3_mult.c
+++ b/crypto/libntrup/src/ref/r3_mult.c
@ -2,30 +2,34 @@
 #include "mod3.h"
 #include "r3.h"

-void r3_mult(small *h,const small *f,const small *g)
 /* Multiplies the degree-<p polynomials f and g and writes the p
    coefficients of the reduced product to h. Coefficient arithmetic uses
    mod3_plusproduct() and mod3_sum(), which are defined elsewhere.
    Steps:
    1. fg[0..2p-2] receives the plain (schoolbook) product: for i < p the
       terms f[j]*g[i-j] with 0 <= j <= i, for i >= p the terms with
       i-p+1 <= j < p.
    2. High coefficients are folded down from the top: fg[i] is added to
       fg[i-p] and fg[i-p+1], i.e. x^p is replaced by x + 1.
    3. The low p coefficients are copied to h.
    The product is built in a local buffer, so h is not written until the
    final copy. */
+void
+r3_mult(small *h, const small *f, const small *g)
 {
  small fg[p + p - 1];
  small result;
  int i, j;

-  for (i = 0;i < p;++i) {
+  for(i = 0; i < p; ++i)
+  {
    result = 0;
-    for (j = 0;j <= i;++j)
-      result = mod3_plusproduct(result,f[j],g[i - j]);
+    for(j = 0; j <= i; ++j)
+      result = mod3_plusproduct(result, f[j], g[i - j]);
    fg[i] = result;
  }
-  for (i = p;i < p + p - 1;++i) {
+  for(i = p; i < p + p - 1; ++i)
+  {
    result = 0;
-    for (j = i - p + 1;j < p;++j)
-      result = mod3_plusproduct(result,f[j],g[i - j]);
+    for(j = i - p + 1; j < p; ++j)
+      result = mod3_plusproduct(result, f[j], g[i - j]);
    fg[i] = result;
  }

-  for (i = p + p - 2;i >= p;--i) {
-    fg[i - p] = mod3_sum(fg[i - p],fg[i]);
-    fg[i - p + 1] = mod3_sum(fg[i - p + 1],fg[i]);
+  for(i = p + p - 2; i >= p; --i)
+  {
+    fg[i - p]     = mod3_sum(fg[i - p], fg[i]);
+    fg[i - p + 1] = mod3_sum(fg[i - p + 1], fg[i]);
  }

-  for (i = 0;i < p;++i)
+  for(i = 0; i < p; ++i)
    h[i] = fg[i];
 }
--- a/crypto/libntrup/src/ref/rq.h
+++ b/crypto/libntrup/src/ref/rq.h
@ -5,24 +5,31 @@
 #include "small.h"

 #define rq_encode crypto_kem_sntrup4591761_ref_rq_encode
-extern void rq_encode(unsigned char *,const modq *);
+extern void
+rq_encode(unsigned char *, const modq *);

 #define rq_decode crypto_kem_sntrup4591761_ref_rq_decode
-extern void rq_decode(modq *,const unsigned char *);
+extern void
+rq_decode(modq *, const unsigned char *);

 #define rq_encoderounded crypto_kem_sntrup4591761_ref_rq_encoderounded
-extern void rq_encoderounded(unsigned char *,const modq *);
+extern void
+rq_encoderounded(unsigned char *, const modq *);

 #define rq_decoderounded crypto_kem_sntrup4591761_ref_rq_decoderounded
-extern void rq_decoderounded(modq *,const unsigned char *);
+extern void
+rq_decoderounded(modq *, const unsigned char *);

 #define rq_round3 crypto_kem_sntrup4591761_ref_rq_round
-extern void rq_round3(modq *,const modq *);
+extern void
+rq_round3(modq *, const modq *);

 #define rq_mult crypto_kem_sntrup4591761_ref_rq_mult
-extern void rq_mult(modq *,const modq *,const small *);
+extern void
+rq_mult(modq *, const modq *, const small *);

 #define rq_recip3 crypto_kem_sntrup4591761_ref_rq_recip3
-int rq_recip3(modq *,const small *);
+int
+rq_recip3(modq *, const small *);

 #endif
--- a/crypto/libntrup/src/ref/rq_mult.c
+++ b/crypto/libntrup/src/ref/rq_mult.c
@ -1,30 +1,34 @@
 #include "params.h"
 #include "rq.h"

-void rq_mult(modq *h,const modq *f,const small *g)
 /* Multiplies the degree-<p polynomial f (modq coefficients) by g (small
    coefficients) and writes the p coefficients of the reduced product to h.
    Coefficient arithmetic uses modq_plusproduct() and modq_sum(), which are
    defined elsewhere.
    Steps:
    1. fg[0..2p-2] receives the plain (schoolbook) product: for i < p the
       terms f[j]*g[i-j] with 0 <= j <= i, for i >= p the terms with
       i-p+1 <= j < p.
    2. High coefficients are folded down from the top: fg[i] is added to
       fg[i-p] and fg[i-p+1], i.e. x^p is replaced by x + 1.
    3. The low p coefficients are copied to h. */
+void
+rq_mult(modq *h, const modq *f, const small *g)
 {
  modq fg[p + p - 1];
  modq result;
  int i, j;

-  for (i = 0;i < p;++i) {
+  for(i = 0; i < p; ++i)
+  {
    result = 0;
-    for (j = 0;j <= i;++j)
-      result = modq_plusproduct(result,f[j],g[i - j]);
+    for(j = 0; j <= i; ++j)
+      result = modq_plusproduct(result, f[j], g[i - j]);
    fg[i] = result;
  }
-  for (i = p;i < p + p - 1;++i) {
+  for(i = p; i < p + p - 1; ++i)
+  {
    result = 0;
-    for (j = i - p + 1;j < p;++j)
-      result = modq_plusproduct(result,f[j],g[i - j]);
+    for(j = i - p + 1; j < p; ++j)
+      result = modq_plusproduct(result, f[j], g[i - j]);
    fg[i] = result;
  }

-  for (i = p + p - 2;i >= p;--i) {
-    fg[i - p] = modq_sum(fg[i - p],fg[i]);
-    fg[i - p + 1] = modq_sum(fg[i - p + 1],fg[i]);
+  for(i = p + p - 2; i >= p; --i)
+  {
+    fg[i - p]     = modq_sum(fg[i - p], fg[i]);
+    fg[i - p + 1] = modq_sum(fg[i - p + 1], fg[i]);
  }

-  for (i = 0;i < p;++i)
+  for(i = 0; i < p; ++i)
    h[i] = fg[i];
 }
--- a/crypto/libntrup/src/ref/rq_round3.c
+++ b/crypto/libntrup/src/ref/rq_round3.c
@ -1,10 +1,11 @@
 #include "params.h"
 #include "rq.h"

-void rq_round3(modq *h,const modq *f)
 /* Rounds each of the p coefficients of f to a multiple of 3, writing h.
    Per coefficient: shift by +2295, compute q = (21846 * v + 32768) >> 16
    (21846 / 65536 is about 1/3, and the +32768 rounds), then return
    3*q - 2295. Since 2295 = 3 * 765 the result is a multiple of 3. */
+void
+rq_round3(modq *h, const modq *f)
 {
  int i;

-  for (i = 0;i < p;++i)
+  for(i = 0; i < p; ++i)
    h[i] = ((21846 * (f[i] + 2295) + 32768) >> 16) * 3 - 2295;
 }
--- a/crypto/libntrup/src/ref/small.c
+++ b/crypto/libntrup/src/ref/small.c
@ -4,34 +4,41 @@
 /* XXX: these functions rely on p mod 4 = 1 */

 /* Packs the p coefficients of f into bytes at c.
    Each coefficient is stored as (value + 1) in a 2-bit field; four
    consecutive coefficients share one byte, the first one in the lowest
    bits. The loop emits p/4 such bytes and the single remaining coefficient
    gets a byte of its own, so p/4 + 1 bytes are written in total.
    all coefficients in -1, 0, 1 */
-void small_encode(unsigned char *c,const small *f)
+void
+small_encode(unsigned char *c, const small *f)
 {
  small c0;
  int i;

-  for (i = 0;i < p/4;++i) {
+  for(i = 0; i < p / 4; ++i)
+  {
    c0 = *f++ + 1;
    c0 += (*f++ + 1) << 2;
    c0 += (*f++ + 1) << 4;
    c0 += (*f++ + 1) << 6;
    *c++ = c0;
  }
-  c0 = *f++ + 1;
+  c0   = *f++ + 1;
  *c++ = c0;
 }

-void small_decode(small *f,const unsigned char *c)
 /* Unpacks p coefficients from the bytes at c into f.
    Every input byte of the main loop yields four coefficients: the low two
    bits are taken, 1 is subtracted, and the byte is shifted right by 2 for
    the next field. After p/4 bytes one more byte is read and only its low
    two bits are used for the last coefficient, so p/4 + 1 bytes are read.
    A 2-bit field value of 3 decodes to 2; no range check is performed. */
+void
+small_decode(small *f, const unsigned char *c)
 {
  unsigned char c0;
  int i;

-  for (i = 0;i < p/4;++i) {
-    c0 = *c++;
-    *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
-    *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
-    *f++ = ((small) (c0 & 3)) - 1; c0 >>= 2;
-    *f++ = ((small) (c0 & 3)) - 1;
+  for(i = 0; i < p / 4; ++i)
+  {
+    c0   = *c++;
+    *f++ = ((small)(c0 & 3)) - 1;
+    c0 >>= 2;
+    *f++ = ((small)(c0 & 3)) - 1;
+    c0 >>= 2;
+    *f++ = ((small)(c0 & 3)) - 1;
+    c0 >>= 2;
+    *f++ = ((small)(c0 & 3)) - 1;
  }
-  c0 = *c++;
-  *f++ = ((small) (c0 & 3)) - 1;
+  c0   = *c++;
+  *f++ = ((small)(c0 & 3)) - 1;
 }
--- a/crypto/libntrup/src/ref/swap.c
+++ b/crypto/libntrup/src/ref/swap.c
@ -1,19 +1,21 @@
 #include "swap.h"

-void swap(void *x,void *y,int bytes,int mask)
 /* Conditionally exchanges the first `bytes` bytes of x and y without
    branching on mask. mask is truncated to a char c; for every byte pair
    t = c & (xi ^ yi) is XORed into both bytes. With c == 0 nothing changes;
    with all bits of c set (mask == -1) the two bytes are exchanged.
    Other mask values would mix individual bits -- callers are presumably
    expected to pass only 0 or -1; confirm.
    i[(char *)x] is the commuted spelling of ((char *)x)[i]. */
+void
+swap(void *x, void *y, int bytes, int mask)
 {
  int i;
  char xi, yi, c, t;

  c = mask;
-  
-  for (i = 0;i < bytes;++i) {
-    xi = i[(char *) x];
-    yi = i[(char *) y];
-    t = c & (xi ^ yi);
+
+  for(i = 0; i < bytes; ++i)
+  {
+    xi = i[(char *)x];
+    yi = i[(char *)y];
+    t  = c & (xi ^ yi);
    xi ^= t;
    yi ^= t;
-    i[(char *) x] = xi;
-    i[(char *) y] = yi;
+    i[(char *)x] = xi;
+    i[(char *)y] = yi;
  }
 }
--- a/crypto/libntrup/src/ref/swap.h
+++ b/crypto/libntrup/src/ref/swap.h
@ -2,6 +2,7 @@
 #define swap_h

 #define swap crypto_kem_sntrup4591761_ref_swap
-extern void swap(void *,void *,int,int);
+extern void
+swap(void *, void *, int, int);

 #endif
--- a/crypto/libsodium/init.c
+++ b/crypto/libsodium/init.c
@ -36,7 +36,7 @@ sodium_init(void)
      return -1; /* LCOV_EXCL_LINE */
    }
    /* if we're here, we already started properly */
-    return initialized ? 0: -1;
+    return initialized ? 0 : -1;
  }
  _sodium_runtime_get_cpu_features();
  _crypto_generichash_blake2b_pick_best_implementation();
--- a/crypto/salsa20/core_salsa_ref.c
+++ b/crypto/salsa20/core_salsa_ref.c
@ -10,116 +10,116 @@ crypto_core_salsa(unsigned char *out, const unsigned char *in,
                  const unsigned char *k, const unsigned char *c,
                  const int rounds)
 {
-    uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
-        x15;
-    uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14,
-        j15;
-    int i;
+  uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
+  int i;

-    j0  = x0  = 0x61707865;
-    j5  = x5  = 0x3320646e;
-    j10 = x10 = 0x79622d32;
-    j15 = x15 = 0x6b206574;
-    if (c != NULL) {
-        j0  = x0  = LOAD32_LE(c + 0);
-        j5  = x5  = LOAD32_LE(c + 4);
-        j10 = x10 = LOAD32_LE(c + 8);
-        j15 = x15 = LOAD32_LE(c + 12);
-    }
-    j1  = x1  = LOAD32_LE(k + 0);
-    j2  = x2  = LOAD32_LE(k + 4);
-    j3  = x3  = LOAD32_LE(k + 8);
-    j4  = x4  = LOAD32_LE(k + 12);
-    j11 = x11 = LOAD32_LE(k + 16);
-    j12 = x12 = LOAD32_LE(k + 20);
-    j13 = x13 = LOAD32_LE(k + 24);
-    j14 = x14 = LOAD32_LE(k + 28);
+  j0 = x0 = 0x61707865;
+  j5 = x5 = 0x3320646e;
+  j10 = x10 = 0x79622d32;
+  j15 = x15 = 0x6b206574;
+  if(c != NULL)
+  {
+    j0 = x0 = LOAD32_LE(c + 0);
+    j5 = x5 = LOAD32_LE(c + 4);
+    j10 = x10 = LOAD32_LE(c + 8);
+    j15 = x15 = LOAD32_LE(c + 12);
+  }
+  j1 = x1 = LOAD32_LE(k + 0);
+  j2 = x2 = LOAD32_LE(k + 4);
+  j3 = x3 = LOAD32_LE(k + 8);
+  j4 = x4 = LOAD32_LE(k + 12);
+  j11 = x11 = LOAD32_LE(k + 16);
+  j12 = x12 = LOAD32_LE(k + 20);
+  j13 = x13 = LOAD32_LE(k + 24);
+  j14 = x14 = LOAD32_LE(k + 28);

-    j6  = x6  = LOAD32_LE(in + 0);
-    j7  = x7  = LOAD32_LE(in + 4);
-    j8  = x8  = LOAD32_LE(in + 8);
-    j9  = x9  = LOAD32_LE(in + 12);
+  j6 = x6 = LOAD32_LE(in + 0);
+  j7 = x7 = LOAD32_LE(in + 4);
+  j8 = x8 = LOAD32_LE(in + 8);
+  j9 = x9 = LOAD32_LE(in + 12);

-    for (i = 0; i < rounds; i += 2) {
-        x4  ^= ROTL32(x0  + x12, 7);
-        x8  ^= ROTL32(x4  + x0, 9);
-        x12 ^= ROTL32(x8  + x4, 13);
-        x0  ^= ROTL32(x12 + x8, 18);
-        x9  ^= ROTL32(x5  + x1, 7);
-        x13 ^= ROTL32(x9  + x5, 9);
-        x1  ^= ROTL32(x13 + x9, 13);
-        x5  ^= ROTL32(x1  + x13, 18);
-        x14 ^= ROTL32(x10 + x6, 7);
-        x2  ^= ROTL32(x14 + x10, 9);
-        x6  ^= ROTL32(x2  + x14, 13);
-        x10 ^= ROTL32(x6  + x2, 18);
-        x3  ^= ROTL32(x15 + x11, 7);
-        x7  ^= ROTL32(x3  + x15, 9);
-        x11 ^= ROTL32(x7  + x3, 13);
-        x15 ^= ROTL32(x11 + x7, 18);
-        x1  ^= ROTL32(x0  + x3, 7);
-        x2  ^= ROTL32(x1  + x0, 9);
-        x3  ^= ROTL32(x2  + x1, 13);
-        x0  ^= ROTL32(x3  + x2, 18);
-        x6  ^= ROTL32(x5  + x4, 7);
-        x7  ^= ROTL32(x6  + x5, 9);
-        x4  ^= ROTL32(x7  + x6, 13);
-        x5  ^= ROTL32(x4  + x7, 18);
-        x11 ^= ROTL32(x10 + x9, 7);
-        x8  ^= ROTL32(x11 + x10, 9);
-        x9  ^= ROTL32(x8  + x11, 13);
-        x10 ^= ROTL32(x9  + x8, 18);
-        x12 ^= ROTL32(x15 + x14, 7);
-        x13 ^= ROTL32(x12 + x15, 9);
-        x14 ^= ROTL32(x13 + x12, 13);
-        x15 ^= ROTL32(x14 + x13, 18);
-    }
-    STORE32_LE(out + 0,  x0  + j0);
-    STORE32_LE(out + 4,  x1  + j1);
-    STORE32_LE(out + 8,  x2  + j2);
-    STORE32_LE(out + 12, x3  + j3);
-    STORE32_LE(out + 16, x4  + j4);
-    STORE32_LE(out + 20, x5  + j5);
-    STORE32_LE(out + 24, x6  + j6);
-    STORE32_LE(out + 28, x7  + j7);
-    STORE32_LE(out + 32, x8  + j8);
-    STORE32_LE(out + 36, x9  + j9);
-    STORE32_LE(out + 40, x10 + j10);
-    STORE32_LE(out + 44, x11 + j11);
-    STORE32_LE(out + 48, x12 + j12);
-    STORE32_LE(out + 52, x13 + j13);
-    STORE32_LE(out + 56, x14 + j14);
-    STORE32_LE(out + 60, x15 + j15);
+  for(i = 0; i < rounds; i += 2)
+  {
+    x4 ^= ROTL32(x0 + x12, 7);
+    x8 ^= ROTL32(x4 + x0, 9);
+    x12 ^= ROTL32(x8 + x4, 13);
+    x0 ^= ROTL32(x12 + x8, 18);
+    x9 ^= ROTL32(x5 + x1, 7);
+    x13 ^= ROTL32(x9 + x5, 9);
+    x1 ^= ROTL32(x13 + x9, 13);
+    x5 ^= ROTL32(x1 + x13, 18);
+    x14 ^= ROTL32(x10 + x6, 7);
+    x2 ^= ROTL32(x14 + x10, 9);
+    x6 ^= ROTL32(x2 + x14, 13);
+    x10 ^= ROTL32(x6 + x2, 18);
+    x3 ^= ROTL32(x15 + x11, 7);
+    x7 ^= ROTL32(x3 + x15, 9);
+    x11 ^= ROTL32(x7 + x3, 13);
+    x15 ^= ROTL32(x11 + x7, 18);
+    x1 ^= ROTL32(x0 + x3, 7);
+    x2 ^= ROTL32(x1 + x0, 9);
+    x3 ^= ROTL32(x2 + x1, 13);
+    x0 ^= ROTL32(x3 + x2, 18);
+    x6 ^= ROTL32(x5 + x4, 7);
+    x7 ^= ROTL32(x6 + x5, 9);
+    x4 ^= ROTL32(x7 + x6, 13);
+    x5 ^= ROTL32(x4 + x7, 18);
+    x11 ^= ROTL32(x10 + x9, 7);
+    x8 ^= ROTL32(x11 + x10, 9);
+    x9 ^= ROTL32(x8 + x11, 13);
+    x10 ^= ROTL32(x9 + x8, 18);
+    x12 ^= ROTL32(x15 + x14, 7);
+    x13 ^= ROTL32(x12 + x15, 9);
+    x14 ^= ROTL32(x13 + x12, 13);
+    x15 ^= ROTL32(x14 + x13, 18);
+  }
+  STORE32_LE(out + 0, x0 + j0);
+  STORE32_LE(out + 4, x1 + j1);
+  STORE32_LE(out + 8, x2 + j2);
+  STORE32_LE(out + 12, x3 + j3);
+  STORE32_LE(out + 16, x4 + j4);
+  STORE32_LE(out + 20, x5 + j5);
+  STORE32_LE(out + 24, x6 + j6);
+  STORE32_LE(out + 28, x7 + j7);
+  STORE32_LE(out + 32, x8 + j8);
+  STORE32_LE(out + 36, x9 + j9);
+  STORE32_LE(out + 40, x10 + j10);
+  STORE32_LE(out + 44, x11 + j11);
+  STORE32_LE(out + 48, x12 + j12);
+  STORE32_LE(out + 52, x13 + j13);
+  STORE32_LE(out + 56, x14 + j14);
+  STORE32_LE(out + 60, x15 + j15);
 }

 /* Runs crypto_core_salsa() with a fixed round count of 20, forwarding
    out, in, k and c unchanged. Always returns 0. */
 int
 crypto_core_salsa20(unsigned char *out, const unsigned char *in,
                     const unsigned char *k, const unsigned char *c)
 {
-    crypto_core_salsa(out, in, k, c, 20);
-    return 0;
+  crypto_core_salsa(out, in, k, c, 20);
+  return 0;
 }

 /* Returns the constant crypto_core_salsa20_OUTPUTBYTES (defined elsewhere). */
 size_t
 crypto_core_salsa20_outputbytes(void)
 {
-    return crypto_core_salsa20_OUTPUTBYTES;
+  return crypto_core_salsa20_OUTPUTBYTES;
 }

 /* Returns the constant crypto_core_salsa20_INPUTBYTES (defined elsewhere). */
 size_t
 crypto_core_salsa20_inputbytes(void)
 {
-    return crypto_core_salsa20_INPUTBYTES;
+  return crypto_core_salsa20_INPUTBYTES;
 }

 /* Returns the constant crypto_core_salsa20_KEYBYTES (defined elsewhere). */
 size_t
 crypto_core_salsa20_keybytes(void)
 {
-    return crypto_core_salsa20_KEYBYTES;
+  return crypto_core_salsa20_KEYBYTES;
 }

 /* Returns the constant crypto_core_salsa20_CONSTBYTES (defined elsewhere). */
 size_t
 crypto_core_salsa20_constbytes(void)
 {
-    return crypto_core_salsa20_CONSTBYTES;
+  return crypto_core_salsa20_CONSTBYTES;
 }
--- a/crypto/salsa20/ref/salsa20_ref.c
+++ b/crypto/salsa20/ref/salsa20_ref.c
@ -13,7 +13,6 @@ Public domain.
 #include "../stream_salsa20.h"
 #include "salsa20_ref.h"

-
 static int
 stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n,
           const unsigned char *k)
@ -132,4 +131,3 @@ struct crypto_stream_salsa20_implementation
        SODIUM_C99(.stream =) stream_ref,
        SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic,
 };
-
--- a/crypto/salsa20/stream_salsa20.h
+++ b/crypto/salsa20/stream_salsa20.h
@ -4,13 +4,13 @@

 #include <stdint.h>

-typedef struct crypto_stream_salsa20_implementation {
-    int (*stream)(unsigned char *c, unsigned long long clen,
-                  const unsigned char *n, const unsigned char *k);
-    int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
-                         unsigned long long mlen,
-                         const unsigned char *n, uint64_t ic,
-                         const unsigned char *k);
 /* Function-pointer table describing one Salsa20 stream implementation.
    stream:        takes an output buffer c of clen bytes plus n and k
                   (presumably nonce and key -- confirm).
    stream_xor_ic: takes output c, input m of mlen bytes, n, a 64-bit ic
                   (presumably an initial block counter -- confirm) and k.
    Both return int; the meaning of the return value is not visible here. */
+typedef struct crypto_stream_salsa20_implementation
+{
+  int (*stream)(unsigned char *c, unsigned long long clen,
+                const unsigned char *n, const unsigned char *k);
+  int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
+                       unsigned long long mlen, const unsigned char *n,
+                       uint64_t ic, const unsigned char *k);
 } crypto_stream_salsa20_implementation;

 #endif
--- a/crypto/salsa20/xmm6int/u0.h
+++ b/crypto/salsa20/xmm6int/u0.h
@ -1,195 +1,199 @@
-if (bytes > 0) {
-    __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
-    __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
-    __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
-    __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
-    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
-    __m128i b0, b1, b2, b3, b4, b5, b6, b7;
-    uint8_t partialblock[64];
-
-    unsigned int i;
-
-    a0 = diag1;
-    for (i = 0; i < ROUNDS; i += 4) {
-        a0    = _mm_add_epi32(a0, diag0);
-        a1    = diag0;
-        b0    = a0;
-        a0    = _mm_slli_epi32(a0, 7);
-        b0    = _mm_srli_epi32(b0, 25);
-        diag3 = _mm_xor_si128(diag3, a0);
-
-        diag3 = _mm_xor_si128(diag3, b0);
-
-        a1    = _mm_add_epi32(a1, diag3);
-        a2    = diag3;
-        b1    = a1;
-        a1    = _mm_slli_epi32(a1, 9);
-        b1    = _mm_srli_epi32(b1, 23);
-        diag2 = _mm_xor_si128(diag2, a1);
-        diag3 = _mm_shuffle_epi32(diag3, 0x93);
-        diag2 = _mm_xor_si128(diag2, b1);
-
-        a2    = _mm_add_epi32(a2, diag2);
-        a3    = diag2;
-        b2    = a2;
-        a2    = _mm_slli_epi32(a2, 13);
-        b2    = _mm_srli_epi32(b2, 19);
-        diag1 = _mm_xor_si128(diag1, a2);
-        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
-        diag1 = _mm_xor_si128(diag1, b2);
-
-        a3    = _mm_add_epi32(a3, diag1);
-        a4    = diag3;
-        b3    = a3;
-        a3    = _mm_slli_epi32(a3, 18);
-        b3    = _mm_srli_epi32(b3, 14);
-        diag0 = _mm_xor_si128(diag0, a3);
-        diag1 = _mm_shuffle_epi32(diag1, 0x39);
-        diag0 = _mm_xor_si128(diag0, b3);
-
-        a4    = _mm_add_epi32(a4, diag0);
-        a5    = diag0;
-        b4    = a4;
-        a4    = _mm_slli_epi32(a4, 7);
-        b4    = _mm_srli_epi32(b4, 25);
-        diag1 = _mm_xor_si128(diag1, a4);
-
-        diag1 = _mm_xor_si128(diag1, b4);
-
-        a5    = _mm_add_epi32(a5, diag1);
-        a6    = diag1;
-        b5    = a5;
-        a5    = _mm_slli_epi32(a5, 9);
-        b5    = _mm_srli_epi32(b5, 23);
-        diag2 = _mm_xor_si128(diag2, a5);
-        diag1 = _mm_shuffle_epi32(diag1, 0x93);
-        diag2 = _mm_xor_si128(diag2, b5);
-
-        a6    = _mm_add_epi32(a6, diag2);
-        a7    = diag2;
-        b6    = a6;
-        a6    = _mm_slli_epi32(a6, 13);
-        b6    = _mm_srli_epi32(b6, 19);
-        diag3 = _mm_xor_si128(diag3, a6);
-        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
-        diag3 = _mm_xor_si128(diag3, b6);
-
-        a7    = _mm_add_epi32(a7, diag3);
-        a0    = diag1;
-        b7    = a7;
-        a7    = _mm_slli_epi32(a7, 18);
-        b7    = _mm_srli_epi32(b7, 14);
-        diag0 = _mm_xor_si128(diag0, a7);
-        diag3 = _mm_shuffle_epi32(diag3, 0x39);
-        diag0 = _mm_xor_si128(diag0, b7);
-
-        a0    = _mm_add_epi32(a0, diag0);
-        a1    = diag0;
-        b0    = a0;
-        a0    = _mm_slli_epi32(a0, 7);
-        b0    = _mm_srli_epi32(b0, 25);
-        diag3 = _mm_xor_si128(diag3, a0);
-
-        diag3 = _mm_xor_si128(diag3, b0);
-
-        a1    = _mm_add_epi32(a1, diag3);
-        a2    = diag3;
-        b1    = a1;
-        a1    = _mm_slli_epi32(a1, 9);
-        b1    = _mm_srli_epi32(b1, 23);
-        diag2 = _mm_xor_si128(diag2, a1);
-        diag3 = _mm_shuffle_epi32(diag3, 0x93);
-        diag2 = _mm_xor_si128(diag2, b1);
-
-        a2    = _mm_add_epi32(a2, diag2);
-        a3    = diag2;
-        b2    = a2;
-        a2    = _mm_slli_epi32(a2, 13);
-        b2    = _mm_srli_epi32(b2, 19);
-        diag1 = _mm_xor_si128(diag1, a2);
-        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
-        diag1 = _mm_xor_si128(diag1, b2);
-
-        a3    = _mm_add_epi32(a3, diag1);
-        a4    = diag3;
-        b3    = a3;
-        a3    = _mm_slli_epi32(a3, 18);
-        b3    = _mm_srli_epi32(b3, 14);
-        diag0 = _mm_xor_si128(diag0, a3);
-        diag1 = _mm_shuffle_epi32(diag1, 0x39);
-        diag0 = _mm_xor_si128(diag0, b3);
-
-        a4    = _mm_add_epi32(a4, diag0);
-        a5    = diag0;
-        b4    = a4;
-        a4    = _mm_slli_epi32(a4, 7);
-        b4    = _mm_srli_epi32(b4, 25);
-        diag1 = _mm_xor_si128(diag1, a4);
-
-        diag1 = _mm_xor_si128(diag1, b4);
-
-        a5    = _mm_add_epi32(a5, diag1);
-        a6    = diag1;
-        b5    = a5;
-        a5    = _mm_slli_epi32(a5, 9);
-        b5    = _mm_srli_epi32(b5, 23);
-        diag2 = _mm_xor_si128(diag2, a5);
-        diag1 = _mm_shuffle_epi32(diag1, 0x93);
-        diag2 = _mm_xor_si128(diag2, b5);
-
-        a6    = _mm_add_epi32(a6, diag2);
-        a7    = diag2;
-        b6    = a6;
-        a6    = _mm_slli_epi32(a6, 13);
-        b6    = _mm_srli_epi32(b6, 19);
-        diag3 = _mm_xor_si128(diag3, a6);
-        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
-        diag3 = _mm_xor_si128(diag3, b6);
-
-        a7    = _mm_add_epi32(a7, diag3);
-        a0    = diag1;
-        b7    = a7;
-        a7    = _mm_slli_epi32(a7, 18);
-        b7    = _mm_srli_epi32(b7, 14);
-        diag0 = _mm_xor_si128(diag0, a7);
-        diag3 = _mm_shuffle_epi32(diag3, 0x39);
-        diag0 = _mm_xor_si128(diag0, b7);
-    }
-
-    diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
-    diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
-    diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
-    diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
-
-#define ONEQUAD_SHUFFLE(A, B, C, D)                      \
-    do {                                                 \
-        uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
-        uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
-        uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
-        uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
-        diag0          = _mm_shuffle_epi32(diag0, 0x39); \
-        diag1          = _mm_shuffle_epi32(diag1, 0x39); \
-        diag2          = _mm_shuffle_epi32(diag2, 0x39); \
-        diag3          = _mm_shuffle_epi32(diag3, 0x39); \
-        *(uint32_t *) (partialblock + (A * 4)) = in##A;  \
-        *(uint32_t *) (partialblock + (B * 4)) = in##B;  \
-        *(uint32_t *) (partialblock + (C * 4)) = in##C;  \
-        *(uint32_t *) (partialblock + (D * 4)) = in##D;  \
-    } while (0)
+if(bytes > 0)
+{
+  __m128i diag0 = _mm_loadu_si128((__m128i *)(x + 0));
+  __m128i diag1 = _mm_loadu_si128((__m128i *)(x + 4));
+  __m128i diag2 = _mm_loadu_si128((__m128i *)(x + 8));
+  __m128i diag3 = _mm_loadu_si128((__m128i *)(x + 12));
+  __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+  __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+  uint8_t partialblock[64];
+
+  unsigned int i;
+
+  a0 = diag1;
+  for(i = 0; i < ROUNDS; i += 4)
+  {
+    a0    = _mm_add_epi32(a0, diag0);
+    a1    = diag0;
+    b0    = a0;
+    a0    = _mm_slli_epi32(a0, 7);
+    b0    = _mm_srli_epi32(b0, 25);
+    diag3 = _mm_xor_si128(diag3, a0);
+
+    diag3 = _mm_xor_si128(diag3, b0);
+
+    a1    = _mm_add_epi32(a1, diag3);
+    a2    = diag3;
+    b1    = a1;
+    a1    = _mm_slli_epi32(a1, 9);
+    b1    = _mm_srli_epi32(b1, 23);
+    diag2 = _mm_xor_si128(diag2, a1);
+    diag3 = _mm_shuffle_epi32(diag3, 0x93);
+    diag2 = _mm_xor_si128(diag2, b1);
+
+    a2    = _mm_add_epi32(a2, diag2);
+    a3    = diag2;
+    b2    = a2;
+    a2    = _mm_slli_epi32(a2, 13);
+    b2    = _mm_srli_epi32(b2, 19);
+    diag1 = _mm_xor_si128(diag1, a2);
+    diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+    diag1 = _mm_xor_si128(diag1, b2);
+
+    a3    = _mm_add_epi32(a3, diag1);
+    a4    = diag3;
+    b3    = a3;
+    a3    = _mm_slli_epi32(a3, 18);
+    b3    = _mm_srli_epi32(b3, 14);
+    diag0 = _mm_xor_si128(diag0, a3);
+    diag1 = _mm_shuffle_epi32(diag1, 0x39);
+    diag0 = _mm_xor_si128(diag0, b3);
+
+    a4    = _mm_add_epi32(a4, diag0);
+    a5    = diag0;
+    b4    = a4;
+    a4    = _mm_slli_epi32(a4, 7);
+    b4    = _mm_srli_epi32(b4, 25);
+    diag1 = _mm_xor_si128(diag1, a4);
+
+    diag1 = _mm_xor_si128(diag1, b4);
+
+    a5    = _mm_add_epi32(a5, diag1);
+    a6    = diag1;
+    b5    = a5;
+    a5    = _mm_slli_epi32(a5, 9);
+    b5    = _mm_srli_epi32(b5, 23);
+    diag2 = _mm_xor_si128(diag2, a5);
+    diag1 = _mm_shuffle_epi32(diag1, 0x93);
+    diag2 = _mm_xor_si128(diag2, b5);
+
+    a6    = _mm_add_epi32(a6, diag2);
+    a7    = diag2;
+    b6    = a6;
+    a6    = _mm_slli_epi32(a6, 13);
+    b6    = _mm_srli_epi32(b6, 19);
+    diag3 = _mm_xor_si128(diag3, a6);
+    diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+    diag3 = _mm_xor_si128(diag3, b6);
+
+    a7    = _mm_add_epi32(a7, diag3);
+    a0    = diag1;
+    b7    = a7;
+    a7    = _mm_slli_epi32(a7, 18);
+    b7    = _mm_srli_epi32(b7, 14);
+    diag0 = _mm_xor_si128(diag0, a7);
+    diag3 = _mm_shuffle_epi32(diag3, 0x39);
+    diag0 = _mm_xor_si128(diag0, b7);
+
+    a0    = _mm_add_epi32(a0, diag0);
+    a1    = diag0;
+    b0    = a0;
+    a0    = _mm_slli_epi32(a0, 7);
+    b0    = _mm_srli_epi32(b0, 25);
+    diag3 = _mm_xor_si128(diag3, a0);
+
+    diag3 = _mm_xor_si128(diag3, b0);
+
+    a1    = _mm_add_epi32(a1, diag3);
+    a2    = diag3;
+    b1    = a1;
+    a1    = _mm_slli_epi32(a1, 9);
+    b1    = _mm_srli_epi32(b1, 23);
+    diag2 = _mm_xor_si128(diag2, a1);
+    diag3 = _mm_shuffle_epi32(diag3, 0x93);
+    diag2 = _mm_xor_si128(diag2, b1);
+
+    a2    = _mm_add_epi32(a2, diag2);
+    a3    = diag2;
+    b2    = a2;
+    a2    = _mm_slli_epi32(a2, 13);
+    b2    = _mm_srli_epi32(b2, 19);
+    diag1 = _mm_xor_si128(diag1, a2);
+    diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+    diag1 = _mm_xor_si128(diag1, b2);
+
+    a3    = _mm_add_epi32(a3, diag1);
+    a4    = diag3;
+    b3    = a3;
+    a3    = _mm_slli_epi32(a3, 18);
+    b3    = _mm_srli_epi32(b3, 14);
+    diag0 = _mm_xor_si128(diag0, a3);
+    diag1 = _mm_shuffle_epi32(diag1, 0x39);
+    diag0 = _mm_xor_si128(diag0, b3);
+
+    a4    = _mm_add_epi32(a4, diag0);
+    a5    = diag0;
+    b4    = a4;
+    a4    = _mm_slli_epi32(a4, 7);
+    b4    = _mm_srli_epi32(b4, 25);
+    diag1 = _mm_xor_si128(diag1, a4);
+
+    diag1 = _mm_xor_si128(diag1, b4);
+
+    a5    = _mm_add_epi32(a5, diag1);
+    a6    = diag1;
+    b5    = a5;
+    a5    = _mm_slli_epi32(a5, 9);
+    b5    = _mm_srli_epi32(b5, 23);
+    diag2 = _mm_xor_si128(diag2, a5);
+    diag1 = _mm_shuffle_epi32(diag1, 0x93);
+    diag2 = _mm_xor_si128(diag2, b5);
+
+    a6    = _mm_add_epi32(a6, diag2);
+    a7    = diag2;
+    b6    = a6;
+    a6    = _mm_slli_epi32(a6, 13);
+    b6    = _mm_srli_epi32(b6, 19);
+    diag3 = _mm_xor_si128(diag3, a6);
+    diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+    diag3 = _mm_xor_si128(diag3, b6);
+
+    a7    = _mm_add_epi32(a7, diag3);
+    a0    = diag1;
+    b7    = a7;
+    a7    = _mm_slli_epi32(a7, 18);
+    b7    = _mm_srli_epi32(b7, 14);
+    diag0 = _mm_xor_si128(diag0, a7);
+    diag3 = _mm_shuffle_epi32(diag3, 0x39);
+    diag0 = _mm_xor_si128(diag0, b7);
+  }
+
+  diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *)(x + 0)));
+  diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *)(x + 4)));
+  diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *)(x + 8)));
+  diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *)(x + 12)));
+
+#define ONEQUAD_SHUFFLE(A, B, C, D)                                         \
+  do                                                                        \
+  {                                                                         \
+    uint32_t in##A                        = _mm_cvtsi128_si32(diag0);       \
+    uint32_t in##B                        = _mm_cvtsi128_si32(diag1);       \
+    uint32_t in##C                        = _mm_cvtsi128_si32(diag2);       \
+    uint32_t in##D                        = _mm_cvtsi128_si32(diag3);       \
+    diag0                                 = _mm_shuffle_epi32(diag0, 0x39); \
+    diag1                                 = _mm_shuffle_epi32(diag1, 0x39); \
+    diag2                                 = _mm_shuffle_epi32(diag2, 0x39); \
+    diag3                                 = _mm_shuffle_epi32(diag3, 0x39); \
+    *(uint32_t *)(partialblock + (A * 4)) = in##A;                          \
+    *(uint32_t *)(partialblock + (B * 4)) = in##B;                          \
+    *(uint32_t *)(partialblock + (C * 4)) = in##C;                          \
+    *(uint32_t *)(partialblock + (D * 4)) = in##D;                          \
+  } while(0)

 #define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)

-    ONEQUAD(0, 12, 8, 4);
-    ONEQUAD(5, 1, 13, 9);
-    ONEQUAD(10, 6, 2, 14);
-    ONEQUAD(15, 11, 7, 3);
+  ONEQUAD(0, 12, 8, 4);
+  ONEQUAD(5, 1, 13, 9);
+  ONEQUAD(10, 6, 2, 14);
+  ONEQUAD(15, 11, 7, 3);

 #undef ONEQUAD
 #undef ONEQUAD_SHUFFLE

-    for (i = 0; i < bytes; i++) {
-        c[i] = m[i] ^ partialblock[i];
-    }
+  for(i = 0; i < bytes; i++)
+  {
+    c[i] = m[i] ^ partialblock[i];
+  }

-    sodium_memzero(partialblock, sizeof partialblock);
+  sodium_memzero(partialblock, sizeof partialblock);
 }
--- a/crypto/salsa20/xmm6int/u1.h
+++ b/crypto/salsa20/xmm6int/u1.h
@ -1,207 +1,211 @@
-while (bytes >= 64) {
-    __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
-    __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
-    __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
-    __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
-    __m128i a0, a1, a2, a3, a4, a5, a6, a7;
-    __m128i b0, b1, b2, b3, b4, b5, b6, b7;
-
-    uint32_t in8;
-    uint32_t in9;
-    int      i;
-
-    a0 = diag1;
-    for (i = 0; i < ROUNDS; i += 4) {
-        a0    = _mm_add_epi32(a0, diag0);
-        a1    = diag0;
-        b0    = a0;
-        a0    = _mm_slli_epi32(a0, 7);
-        b0    = _mm_srli_epi32(b0, 25);
-        diag3 = _mm_xor_si128(diag3, a0);
-
-        diag3 = _mm_xor_si128(diag3, b0);
-
-        a1    = _mm_add_epi32(a1, diag3);
-        a2    = diag3;
-        b1    = a1;
-        a1    = _mm_slli_epi32(a1, 9);
-        b1    = _mm_srli_epi32(b1, 23);
-        diag2 = _mm_xor_si128(diag2, a1);
-        diag3 = _mm_shuffle_epi32(diag3, 0x93);
-        diag2 = _mm_xor_si128(diag2, b1);
-
-        a2    = _mm_add_epi32(a2, diag2);
-        a3    = diag2;
-        b2    = a2;
-        a2    = _mm_slli_epi32(a2, 13);
-        b2    = _mm_srli_epi32(b2, 19);
-        diag1 = _mm_xor_si128(diag1, a2);
-        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
-        diag1 = _mm_xor_si128(diag1, b2);
-
-        a3    = _mm_add_epi32(a3, diag1);
-        a4    = diag3;
-        b3    = a3;
-        a3    = _mm_slli_epi32(a3, 18);
-        b3    = _mm_srli_epi32(b3, 14);
-        diag0 = _mm_xor_si128(diag0, a3);
-        diag1 = _mm_shuffle_epi32(diag1, 0x39);
-        diag0 = _mm_xor_si128(diag0, b3);
-
-        a4    = _mm_add_epi32(a4, diag0);
-        a5    = diag0;
-        b4    = a4;
-        a4    = _mm_slli_epi32(a4, 7);
-        b4    = _mm_srli_epi32(b4, 25);
-        diag1 = _mm_xor_si128(diag1, a4);
-
-        diag1 = _mm_xor_si128(diag1, b4);
-
-        a5    = _mm_add_epi32(a5, diag1);
-        a6    = diag1;
-        b5    = a5;
-        a5    = _mm_slli_epi32(a5, 9);
-        b5    = _mm_srli_epi32(b5, 23);
-        diag2 = _mm_xor_si128(diag2, a5);
-        diag1 = _mm_shuffle_epi32(diag1, 0x93);
-        diag2 = _mm_xor_si128(diag2, b5);
-
-        a6    = _mm_add_epi32(a6, diag2);
-        a7    = diag2;
-        b6    = a6;
-        a6    = _mm_slli_epi32(a6, 13);
-        b6    = _mm_srli_epi32(b6, 19);
-        diag3 = _mm_xor_si128(diag3, a6);
-        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
-        diag3 = _mm_xor_si128(diag3, b6);
-
-        a7    = _mm_add_epi32(a7, diag3);
-        a0    = diag1;
-        b7    = a7;
-        a7    = _mm_slli_epi32(a7, 18);
-        b7    = _mm_srli_epi32(b7, 14);
-        diag0 = _mm_xor_si128(diag0, a7);
-        diag3 = _mm_shuffle_epi32(diag3, 0x39);
-        diag0 = _mm_xor_si128(diag0, b7);
-
-        a0    = _mm_add_epi32(a0, diag0);
-        a1    = diag0;
-        b0    = a0;
-        a0    = _mm_slli_epi32(a0, 7);
-        b0    = _mm_srli_epi32(b0, 25);
-        diag3 = _mm_xor_si128(diag3, a0);
-
-        diag3 = _mm_xor_si128(diag3, b0);
-
-        a1    = _mm_add_epi32(a1, diag3);
-        a2    = diag3;
-        b1    = a1;
-        a1    = _mm_slli_epi32(a1, 9);
-        b1    = _mm_srli_epi32(b1, 23);
-        diag2 = _mm_xor_si128(diag2, a1);
-        diag3 = _mm_shuffle_epi32(diag3, 0x93);
-        diag2 = _mm_xor_si128(diag2, b1);
-
-        a2    = _mm_add_epi32(a2, diag2);
-        a3    = diag2;
-        b2    = a2;
-        a2    = _mm_slli_epi32(a2, 13);
-        b2    = _mm_srli_epi32(b2, 19);
-        diag1 = _mm_xor_si128(diag1, a2);
-        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
-        diag1 = _mm_xor_si128(diag1, b2);
-
-        a3    = _mm_add_epi32(a3, diag1);
-        a4    = diag3;
-        b3    = a3;
-        a3    = _mm_slli_epi32(a3, 18);
-        b3    = _mm_srli_epi32(b3, 14);
-        diag0 = _mm_xor_si128(diag0, a3);
-        diag1 = _mm_shuffle_epi32(diag1, 0x39);
-        diag0 = _mm_xor_si128(diag0, b3);
-
-        a4    = _mm_add_epi32(a4, diag0);
-        a5    = diag0;
-        b4    = a4;
-        a4    = _mm_slli_epi32(a4, 7);
-        b4    = _mm_srli_epi32(b4, 25);
-        diag1 = _mm_xor_si128(diag1, a4);
-
-        diag1 = _mm_xor_si128(diag1, b4);
-
-        a5    = _mm_add_epi32(a5, diag1);
-        a6    = diag1;
-        b5    = a5;
-        a5    = _mm_slli_epi32(a5, 9);
-        b5    = _mm_srli_epi32(b5, 23);
-        diag2 = _mm_xor_si128(diag2, a5);
-        diag1 = _mm_shuffle_epi32(diag1, 0x93);
-        diag2 = _mm_xor_si128(diag2, b5);
-
-        a6    = _mm_add_epi32(a6, diag2);
-        a7    = diag2;
-        b6    = a6;
-        a6    = _mm_slli_epi32(a6, 13);
-        b6    = _mm_srli_epi32(b6, 19);
-        diag3 = _mm_xor_si128(diag3, a6);
-        diag2 = _mm_shuffle_epi32(diag2, 0x4e);
-        diag3 = _mm_xor_si128(diag3, b6);
-
-        a7    = _mm_add_epi32(a7, diag3);
-        a0    = diag1;
-        b7    = a7;
-        a7    = _mm_slli_epi32(a7, 18);
-        b7    = _mm_srli_epi32(b7, 14);
-        diag0 = _mm_xor_si128(diag0, a7);
-        diag3 = _mm_shuffle_epi32(diag3, 0x39);
-        diag0 = _mm_xor_si128(diag0, b7);
-    }
-
-    diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
-    diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
-    diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
-    diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
-
-#define ONEQUAD_SHUFFLE(A, B, C, D)                      \
-    do {                                                 \
-        uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
-        uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
-        uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
-        uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
-        diag0          = _mm_shuffle_epi32(diag0, 0x39); \
-        diag1          = _mm_shuffle_epi32(diag1, 0x39); \
-        diag2          = _mm_shuffle_epi32(diag2, 0x39); \
-        diag3          = _mm_shuffle_epi32(diag3, 0x39); \
-        in##A ^= *(uint32_t *) (m + (A * 4));            \
-        in##B ^= *(uint32_t *) (m + (B * 4));            \
-        in##C ^= *(uint32_t *) (m + (C * 4));            \
-        in##D ^= *(uint32_t *) (m + (D * 4));            \
-        *(uint32_t *) (c + (A * 4)) = in##A;             \
-        *(uint32_t *) (c + (B * 4)) = in##B;             \
-        *(uint32_t *) (c + (C * 4)) = in##C;             \
-        *(uint32_t *) (c + (D * 4)) = in##D;             \
-    } while (0)
+while(bytes >= 64)
+{
+  __m128i diag0 = _mm_loadu_si128((__m128i *)(x + 0));
+  __m128i diag1 = _mm_loadu_si128((__m128i *)(x + 4));
+  __m128i diag2 = _mm_loadu_si128((__m128i *)(x + 8));
+  __m128i diag3 = _mm_loadu_si128((__m128i *)(x + 12));
+  __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+  __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+
+  uint32_t in8;
+  uint32_t in9;
+  int i;
+
+  a0 = diag1;
+  for(i = 0; i < ROUNDS; i += 4)
+  {
+    a0    = _mm_add_epi32(a0, diag0);
+    a1    = diag0;
+    b0    = a0;
+    a0    = _mm_slli_epi32(a0, 7);
+    b0    = _mm_srli_epi32(b0, 25);
+    diag3 = _mm_xor_si128(diag3, a0);
+
+    diag3 = _mm_xor_si128(diag3, b0);
+
+    a1    = _mm_add_epi32(a1, diag3);
+    a2    = diag3;
+    b1    = a1;
+    a1    = _mm_slli_epi32(a1, 9);
+    b1    = _mm_srli_epi32(b1, 23);
+    diag2 = _mm_xor_si128(diag2, a1);
+    diag3 = _mm_shuffle_epi32(diag3, 0x93);
+    diag2 = _mm_xor_si128(diag2, b1);
+
+    a2    = _mm_add_epi32(a2, diag2);
+    a3    = diag2;
+    b2    = a2;
+    a2    = _mm_slli_epi32(a2, 13);
+    b2    = _mm_srli_epi32(b2, 19);
+    diag1 = _mm_xor_si128(diag1, a2);
+    diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+    diag1 = _mm_xor_si128(diag1, b2);
+
+    a3    = _mm_add_epi32(a3, diag1);
+    a4    = diag3;
+    b3    = a3;
+    a3    = _mm_slli_epi32(a3, 18);
+    b3    = _mm_srli_epi32(b3, 14);
+    diag0 = _mm_xor_si128(diag0, a3);
+    diag1 = _mm_shuffle_epi32(diag1, 0x39);
+    diag0 = _mm_xor_si128(diag0, b3);
+
+    a4    = _mm_add_epi32(a4, diag0);
+    a5    = diag0;
+    b4    = a4;
+    a4    = _mm_slli_epi32(a4, 7);
+    b4    = _mm_srli_epi32(b4, 25);
+    diag1 = _mm_xor_si128(diag1, a4);
+
+    diag1 = _mm_xor_si128(diag1, b4);
+
+    a5    = _mm_add_epi32(a5, diag1);
+    a6    = diag1;
+    b5    = a5;
+    a5    = _mm_slli_epi32(a5, 9);
+    b5    = _mm_srli_epi32(b5, 23);
+    diag2 = _mm_xor_si128(diag2, a5);
+    diag1 = _mm_shuffle_epi32(diag1, 0x93);
+    diag2 = _mm_xor_si128(diag2, b5);
+
+    a6    = _mm_add_epi32(a6, diag2);
+    a7    = diag2;
+    b6    = a6;
+    a6    = _mm_slli_epi32(a6, 13);
+    b6    = _mm_srli_epi32(b6, 19);
+    diag3 = _mm_xor_si128(diag3, a6);
+    diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+    diag3 = _mm_xor_si128(diag3, b6);
+
+    a7    = _mm_add_epi32(a7, diag3);
+    a0    = diag1;
+    b7    = a7;
+    a7    = _mm_slli_epi32(a7, 18);
+    b7    = _mm_srli_epi32(b7, 14);
+    diag0 = _mm_xor_si128(diag0, a7);
+    diag3 = _mm_shuffle_epi32(diag3, 0x39);
+    diag0 = _mm_xor_si128(diag0, b7);
+
+    a0    = _mm_add_epi32(a0, diag0);
+    a1    = diag0;
+    b0    = a0;
+    a0    = _mm_slli_epi32(a0, 7);
+    b0    = _mm_srli_epi32(b0, 25);
+    diag3 = _mm_xor_si128(diag3, a0);
+
+    diag3 = _mm_xor_si128(diag3, b0);
+
+    a1    = _mm_add_epi32(a1, diag3);
+    a2    = diag3;
+    b1    = a1;
+    a1    = _mm_slli_epi32(a1, 9);
+    b1    = _mm_srli_epi32(b1, 23);
+    diag2 = _mm_xor_si128(diag2, a1);
+    diag3 = _mm_shuffle_epi32(diag3, 0x93);
+    diag2 = _mm_xor_si128(diag2, b1);
+
+    a2    = _mm_add_epi32(a2, diag2);
+    a3    = diag2;
+    b2    = a2;
+    a2    = _mm_slli_epi32(a2, 13);
+    b2    = _mm_srli_epi32(b2, 19);
+    diag1 = _mm_xor_si128(diag1, a2);
+    diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+    diag1 = _mm_xor_si128(diag1, b2);
+
+    a3    = _mm_add_epi32(a3, diag1);
+    a4    = diag3;
+    b3    = a3;
+    a3    = _mm_slli_epi32(a3, 18);
+    b3    = _mm_srli_epi32(b3, 14);
+    diag0 = _mm_xor_si128(diag0, a3);
+    diag1 = _mm_shuffle_epi32(diag1, 0x39);
+    diag0 = _mm_xor_si128(diag0, b3);
+
+    a4    = _mm_add_epi32(a4, diag0);
+    a5    = diag0;
+    b4    = a4;
+    a4    = _mm_slli_epi32(a4, 7);
+    b4    = _mm_srli_epi32(b4, 25);
+    diag1 = _mm_xor_si128(diag1, a4);
+
+    diag1 = _mm_xor_si128(diag1, b4);
+
+    a5    = _mm_add_epi32(a5, diag1);
+    a6    = diag1;
+    b5    = a5;
+    a5    = _mm_slli_epi32(a5, 9);
+    b5    = _mm_srli_epi32(b5, 23);
+    diag2 = _mm_xor_si128(diag2, a5);
+    diag1 = _mm_shuffle_epi32(diag1, 0x93);
+    diag2 = _mm_xor_si128(diag2, b5);
+
+    a6    = _mm_add_epi32(a6, diag2);
+    a7    = diag2;
+    b6    = a6;
+    a6    = _mm_slli_epi32(a6, 13);
+    b6    = _mm_srli_epi32(b6, 19);
+    diag3 = _mm_xor_si128(diag3, a6);
+    diag2 = _mm_shuffle_epi32(diag2, 0x4e);
+    diag3 = _mm_xor_si128(diag3, b6);
+
+    a7    = _mm_add_epi32(a7, diag3);
+    a0    = diag1;
+    b7    = a7;
+    a7    = _mm_slli_epi32(a7, 18);
+    b7    = _mm_srli_epi32(b7, 14);
+    diag0 = _mm_xor_si128(diag0, a7);
+    diag3 = _mm_shuffle_epi32(diag3, 0x39);
+    diag0 = _mm_xor_si128(diag0, b7);
+  }
+
+  diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *)(x + 0)));
+  diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *)(x + 4)));
+  diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *)(x + 8)));
+  diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *)(x + 12)));
+
+#define ONEQUAD_SHUFFLE(A, B, C, D)                  \
+  do                                                 \
+  {                                                  \
+    uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
+    uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
+    uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
+    uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
+    diag0          = _mm_shuffle_epi32(diag0, 0x39); \
+    diag1          = _mm_shuffle_epi32(diag1, 0x39); \
+    diag2          = _mm_shuffle_epi32(diag2, 0x39); \
+    diag3          = _mm_shuffle_epi32(diag3, 0x39); \
+    in##A ^= *(uint32_t *)(m + (A * 4));             \
+    in##B ^= *(uint32_t *)(m + (B * 4));             \
+    in##C ^= *(uint32_t *)(m + (C * 4));             \
+    in##D ^= *(uint32_t *)(m + (D * 4));             \
+    *(uint32_t *)(c + (A * 4)) = in##A;              \
+    *(uint32_t *)(c + (B * 4)) = in##B;              \
+    *(uint32_t *)(c + (C * 4)) = in##C;              \
+    *(uint32_t *)(c + (D * 4)) = in##D;              \
+  } while(0)

 #define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)

-    ONEQUAD(0, 12, 8, 4);
-    ONEQUAD(5, 1, 13, 9);
-    ONEQUAD(10, 6, 2, 14);
-    ONEQUAD(15, 11, 7, 3);
+  ONEQUAD(0, 12, 8, 4);
+  ONEQUAD(5, 1, 13, 9);
+  ONEQUAD(10, 6, 2, 14);
+  ONEQUAD(15, 11, 7, 3);

 #undef ONEQUAD
 #undef ONEQUAD_SHUFFLE

-    in8 = x[8];
-    in9 = x[13];
-    in8++;
-    if (in8 == 0) {
-        in9++;
-    }
-    x[8]  = in8;
-    x[13] = in9;
-
-    c += 64;
-    m += 64;
-    bytes -= 64;
+  in8 = x[8];
+  in9 = x[13];
+  in8++;
+  if(in8 == 0)
+  {
+    in9++;
+  }
+  x[8]  = in8;
+  x[13] = in9;
+
+  c += 64;
+  m += 64;
+  bytes -= 64;
 }
--- a/crypto/salsa20/xmm6int/u4.h
+++ b/crypto/salsa20/xmm6int/u4.h
--- a/crypto/salsa20/xmm6int/u8.h
+++ b/crypto/salsa20/xmm6int/u8.h
@ -1,476 +1,471 @@
-if (bytes >= 512) {
-    __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
-        y15;
-
-    /* the naive way seems as fast (if not a bit faster) than the vector way */
-    __m256i z0  = _mm256_set1_epi32(x[0]);
-    __m256i z5  = _mm256_set1_epi32(x[1]);
-    __m256i z10 = _mm256_set1_epi32(x[2]);
-    __m256i z15 = _mm256_set1_epi32(x[3]);
-    __m256i z12 = _mm256_set1_epi32(x[4]);
-    __m256i z1  = _mm256_set1_epi32(x[5]);
-    __m256i z6  = _mm256_set1_epi32(x[6]);
-    __m256i z11 = _mm256_set1_epi32(x[7]);
-    __m256i z8; /* useless */
-    __m256i z13 = _mm256_set1_epi32(x[9]);
-    __m256i z2  = _mm256_set1_epi32(x[10]);
-    __m256i z7  = _mm256_set1_epi32(x[11]);
-    __m256i z4  = _mm256_set1_epi32(x[12]);
-    __m256i z9; /* useless */
-    __m256i z14 = _mm256_set1_epi32(x[14]);
-    __m256i z3  = _mm256_set1_epi32(x[15]);
-
-    __m256i orig0 = z0;
-    __m256i orig1 = z1;
-    __m256i orig2 = z2;
-    __m256i orig3 = z3;
-    __m256i orig4 = z4;
-    __m256i orig5 = z5;
-    __m256i orig6 = z6;
-    __m256i orig7 = z7;
-    __m256i orig8;
-    __m256i orig9;
-    __m256i orig10 = z10;
-    __m256i orig11 = z11;
-    __m256i orig12 = z12;
-    __m256i orig13 = z13;
-    __m256i orig14 = z14;
-    __m256i orig15 = z15;
-
-    uint32_t in8;
-    uint32_t in9;
-    int      i;
-
-    while (bytes >= 512) {
-        /* vector implementation for z8 and z9 */
-        /* faster than the naive version for 8 blocks */
-        const __m256i addv8   = _mm256_set_epi64x(3, 2, 1, 0);
-        const __m256i addv9   = _mm256_set_epi64x(7, 6, 5, 4);
-        const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
-
-        __m256i  t8, t9;
-        uint64_t in89;
-
-        in8  = x[8];
-        in9  = x[13]; /* see arrays above for the address translation */
-        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
-
-        z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
-
-        t8 = _mm256_add_epi64(addv8, z8);
-        t9 = _mm256_add_epi64(addv9, z9);
-
-        z8 = _mm256_unpacklo_epi32(t8, t9);
-        z9 = _mm256_unpackhi_epi32(t8, t9);
-
-        t8 = _mm256_unpacklo_epi32(z8, z9);
-        t9 = _mm256_unpackhi_epi32(z8, z9);
-
-        /* required because unpack* are intra-lane */
-        z8 = _mm256_permutevar8x32_epi32(t8, permute);
-        z9 = _mm256_permutevar8x32_epi32(t9, permute);
-
-        orig8 = z8;
-        orig9 = z9;
-
-        in89 += 8;
-
-        x[8]  = in89 & 0xFFFFFFFF;
-        x[13] = (in89 >> 32) & 0xFFFFFFFF;
-
-        z5  = orig5;
-        z10 = orig10;
-        z15 = orig15;
-        z14 = orig14;
-        z3  = orig3;
-        z6  = orig6;
-        z11 = orig11;
-        z1  = orig1;
-
-        z7  = orig7;
-        z13 = orig13;
-        z2  = orig2;
-        z9  = orig9;
-        z0  = orig0;
-        z12 = orig12;
-        z4  = orig4;
-        z8  = orig8;
-
-        for (i = 0; i < ROUNDS; i += 2) {
-            /* the inner loop is a direct translation (regexp search/replace)
-             * from the amd64-xmm6 ASM */
-            __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
-                r14, r15;
-
-            y4 = z12;
-            y4 = _mm256_add_epi32(y4, z0);
-            r4 = y4;
-            y4 = _mm256_slli_epi32(y4, 7);
-            z4 = _mm256_xor_si256(z4, y4);
-            r4 = _mm256_srli_epi32(r4, 25);
-            z4 = _mm256_xor_si256(z4, r4);
-
-            y9 = z1;
-            y9 = _mm256_add_epi32(y9, z5);
-            r9 = y9;
-            y9 = _mm256_slli_epi32(y9, 7);
-            z9 = _mm256_xor_si256(z9, y9);
-            r9 = _mm256_srli_epi32(r9, 25);
-            z9 = _mm256_xor_si256(z9, r9);
-
-            y8 = z0;
-            y8 = _mm256_add_epi32(y8, z4);
-            r8 = y8;
-            y8 = _mm256_slli_epi32(y8, 9);
-            z8 = _mm256_xor_si256(z8, y8);
-            r8 = _mm256_srli_epi32(r8, 23);
-            z8 = _mm256_xor_si256(z8, r8);
-
-            y13 = z5;
-            y13 = _mm256_add_epi32(y13, z9);
-            r13 = y13;
-            y13 = _mm256_slli_epi32(y13, 9);
-            z13 = _mm256_xor_si256(z13, y13);
-            r13 = _mm256_srli_epi32(r13, 23);
-            z13 = _mm256_xor_si256(z13, r13);
-
-            y12 = z4;
-            y12 = _mm256_add_epi32(y12, z8);
-            r12 = y12;
-            y12 = _mm256_slli_epi32(y12, 13);
-            z12 = _mm256_xor_si256(z12, y12);
-            r12 = _mm256_srli_epi32(r12, 19);
-            z12 = _mm256_xor_si256(z12, r12);
-
-            y1 = z9;
-            y1 = _mm256_add_epi32(y1, z13);
-            r1 = y1;
-            y1 = _mm256_slli_epi32(y1, 13);
-            z1 = _mm256_xor_si256(z1, y1);
-            r1 = _mm256_srli_epi32(r1, 19);
-            z1 = _mm256_xor_si256(z1, r1);
-
-            y0 = z8;
-            y0 = _mm256_add_epi32(y0, z12);
-            r0 = y0;
-            y0 = _mm256_slli_epi32(y0, 18);
-            z0 = _mm256_xor_si256(z0, y0);
-            r0 = _mm256_srli_epi32(r0, 14);
-            z0 = _mm256_xor_si256(z0, r0);
-
-            y5 = z13;
-            y5 = _mm256_add_epi32(y5, z1);
-            r5 = y5;
-            y5 = _mm256_slli_epi32(y5, 18);
-            z5 = _mm256_xor_si256(z5, y5);
-            r5 = _mm256_srli_epi32(r5, 14);
-            z5 = _mm256_xor_si256(z5, r5);
-
-            y14 = z6;
-            y14 = _mm256_add_epi32(y14, z10);
-            r14 = y14;
-            y14 = _mm256_slli_epi32(y14, 7);
-            z14 = _mm256_xor_si256(z14, y14);
-            r14 = _mm256_srli_epi32(r14, 25);
-            z14 = _mm256_xor_si256(z14, r14);
-
-            y3 = z11;
-            y3 = _mm256_add_epi32(y3, z15);
-            r3 = y3;
-            y3 = _mm256_slli_epi32(y3, 7);
-            z3 = _mm256_xor_si256(z3, y3);
-            r3 = _mm256_srli_epi32(r3, 25);
-            z3 = _mm256_xor_si256(z3, r3);
-
-            y2 = z10;
-            y2 = _mm256_add_epi32(y2, z14);
-            r2 = y2;
-            y2 = _mm256_slli_epi32(y2, 9);
-            z2 = _mm256_xor_si256(z2, y2);
-            r2 = _mm256_srli_epi32(r2, 23);
-            z2 = _mm256_xor_si256(z2, r2);
-
-            y7 = z15;
-            y7 = _mm256_add_epi32(y7, z3);
-            r7 = y7;
-            y7 = _mm256_slli_epi32(y7, 9);
-            z7 = _mm256_xor_si256(z7, y7);
-            r7 = _mm256_srli_epi32(r7, 23);
-            z7 = _mm256_xor_si256(z7, r7);
-
-            y6 = z14;
-            y6 = _mm256_add_epi32(y6, z2);
-            r6 = y6;
-            y6 = _mm256_slli_epi32(y6, 13);
-            z6 = _mm256_xor_si256(z6, y6);
-            r6 = _mm256_srli_epi32(r6, 19);
-            z6 = _mm256_xor_si256(z6, r6);
-
-            y11 = z3;
-            y11 = _mm256_add_epi32(y11, z7);
-            r11 = y11;
-            y11 = _mm256_slli_epi32(y11, 13);
-            z11 = _mm256_xor_si256(z11, y11);
-            r11 = _mm256_srli_epi32(r11, 19);
-            z11 = _mm256_xor_si256(z11, r11);
-
-            y10 = z2;
-            y10 = _mm256_add_epi32(y10, z6);
-            r10 = y10;
-            y10 = _mm256_slli_epi32(y10, 18);
-            z10 = _mm256_xor_si256(z10, y10);
-            r10 = _mm256_srli_epi32(r10, 14);
-            z10 = _mm256_xor_si256(z10, r10);
-
-            y1 = z3;
-            y1 = _mm256_add_epi32(y1, z0);
-            r1 = y1;
-            y1 = _mm256_slli_epi32(y1, 7);
-            z1 = _mm256_xor_si256(z1, y1);
-            r1 = _mm256_srli_epi32(r1, 25);
-            z1 = _mm256_xor_si256(z1, r1);
-
-            y15 = z7;
-            y15 = _mm256_add_epi32(y15, z11);
-            r15 = y15;
-            y15 = _mm256_slli_epi32(y15, 18);
-            z15 = _mm256_xor_si256(z15, y15);
-            r15 = _mm256_srli_epi32(r15, 14);
-            z15 = _mm256_xor_si256(z15, r15);
-
-            y6 = z4;
-            y6 = _mm256_add_epi32(y6, z5);
-            r6 = y6;
-            y6 = _mm256_slli_epi32(y6, 7);
-            z6 = _mm256_xor_si256(z6, y6);
-            r6 = _mm256_srli_epi32(r6, 25);
-            z6 = _mm256_xor_si256(z6, r6);
-
-            y2 = z0;
-            y2 = _mm256_add_epi32(y2, z1);
-            r2 = y2;
-            y2 = _mm256_slli_epi32(y2, 9);
-            z2 = _mm256_xor_si256(z2, y2);
-            r2 = _mm256_srli_epi32(r2, 23);
-            z2 = _mm256_xor_si256(z2, r2);
-
-            y7 = z5;
-            y7 = _mm256_add_epi32(y7, z6);
-            r7 = y7;
-            y7 = _mm256_slli_epi32(y7, 9);
-            z7 = _mm256_xor_si256(z7, y7);
-            r7 = _mm256_srli_epi32(r7, 23);
-            z7 = _mm256_xor_si256(z7, r7);
-
-            y3 = z1;
-            y3 = _mm256_add_epi32(y3, z2);
-            r3 = y3;
-            y3 = _mm256_slli_epi32(y3, 13);
-            z3 = _mm256_xor_si256(z3, y3);
-            r3 = _mm256_srli_epi32(r3, 19);
-            z3 = _mm256_xor_si256(z3, r3);
-
-            y4 = z6;
-            y4 = _mm256_add_epi32(y4, z7);
-            r4 = y4;
-            y4 = _mm256_slli_epi32(y4, 13);
-            z4 = _mm256_xor_si256(z4, y4);
-            r4 = _mm256_srli_epi32(r4, 19);
-            z4 = _mm256_xor_si256(z4, r4);
-
-            y0 = z2;
-            y0 = _mm256_add_epi32(y0, z3);
-            r0 = y0;
-            y0 = _mm256_slli_epi32(y0, 18);
-            z0 = _mm256_xor_si256(z0, y0);
-            r0 = _mm256_srli_epi32(r0, 14);
-            z0 = _mm256_xor_si256(z0, r0);
-
-            y5 = z7;
-            y5 = _mm256_add_epi32(y5, z4);
-            r5 = y5;
-            y5 = _mm256_slli_epi32(y5, 18);
-            z5 = _mm256_xor_si256(z5, y5);
-            r5 = _mm256_srli_epi32(r5, 14);
-            z5 = _mm256_xor_si256(z5, r5);
-
-            y11 = z9;
-            y11 = _mm256_add_epi32(y11, z10);
-            r11 = y11;
-            y11 = _mm256_slli_epi32(y11, 7);
-            z11 = _mm256_xor_si256(z11, y11);
-            r11 = _mm256_srli_epi32(r11, 25);
-            z11 = _mm256_xor_si256(z11, r11);
-
-            y12 = z14;
-            y12 = _mm256_add_epi32(y12, z15);
-            r12 = y12;
-            y12 = _mm256_slli_epi32(y12, 7);
-            z12 = _mm256_xor_si256(z12, y12);
-            r12 = _mm256_srli_epi32(r12, 25);
-            z12 = _mm256_xor_si256(z12, r12);
-
-            y8 = z10;
-            y8 = _mm256_add_epi32(y8, z11);
-            r8 = y8;
-            y8 = _mm256_slli_epi32(y8, 9);
-            z8 = _mm256_xor_si256(z8, y8);
-            r8 = _mm256_srli_epi32(r8, 23);
-            z8 = _mm256_xor_si256(z8, r8);
-
-            y13 = z15;
-            y13 = _mm256_add_epi32(y13, z12);
-            r13 = y13;
-            y13 = _mm256_slli_epi32(y13, 9);
-            z13 = _mm256_xor_si256(z13, y13);
-            r13 = _mm256_srli_epi32(r13, 23);
-            z13 = _mm256_xor_si256(z13, r13);
-
-            y9 = z11;
-            y9 = _mm256_add_epi32(y9, z8);
-            r9 = y9;
-            y9 = _mm256_slli_epi32(y9, 13);
-            z9 = _mm256_xor_si256(z9, y9);
-            r9 = _mm256_srli_epi32(r9, 19);
-            z9 = _mm256_xor_si256(z9, r9);
-
-            y14 = z12;
-            y14 = _mm256_add_epi32(y14, z13);
-            r14 = y14;
-            y14 = _mm256_slli_epi32(y14, 13);
-            z14 = _mm256_xor_si256(z14, y14);
-            r14 = _mm256_srli_epi32(r14, 19);
-            z14 = _mm256_xor_si256(z14, r14);
-
-            y10 = z8;
-            y10 = _mm256_add_epi32(y10, z9);
-            r10 = y10;
-            y10 = _mm256_slli_epi32(y10, 18);
-            z10 = _mm256_xor_si256(z10, y10);
-            r10 = _mm256_srli_epi32(r10, 14);
-            z10 = _mm256_xor_si256(z10, r10);
-
-            y15 = z13;
-            y15 = _mm256_add_epi32(y15, z14);
-            r15 = y15;
-            y15 = _mm256_slli_epi32(y15, 18);
-            z15 = _mm256_xor_si256(z15, y15);
-            r15 = _mm256_srli_epi32(r15, 14);
-            z15 = _mm256_xor_si256(z15, r15);
-        }
+if(bytes >= 512)
+{
+  __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
+
+  /* the naive way seems as fast (if not a bit faster) than the vector way */
+  __m256i z0  = _mm256_set1_epi32(x[0]);
+  __m256i z5  = _mm256_set1_epi32(x[1]);
+  __m256i z10 = _mm256_set1_epi32(x[2]);
+  __m256i z15 = _mm256_set1_epi32(x[3]);
+  __m256i z12 = _mm256_set1_epi32(x[4]);
+  __m256i z1  = _mm256_set1_epi32(x[5]);
+  __m256i z6  = _mm256_set1_epi32(x[6]);
+  __m256i z11 = _mm256_set1_epi32(x[7]);
+  __m256i z8; /* useless */
+  __m256i z13 = _mm256_set1_epi32(x[9]);
+  __m256i z2  = _mm256_set1_epi32(x[10]);
+  __m256i z7  = _mm256_set1_epi32(x[11]);
+  __m256i z4  = _mm256_set1_epi32(x[12]);
+  __m256i z9; /* useless */
+  __m256i z14 = _mm256_set1_epi32(x[14]);
+  __m256i z3  = _mm256_set1_epi32(x[15]);
+
+  __m256i orig0 = z0;
+  __m256i orig1 = z1;
+  __m256i orig2 = z2;
+  __m256i orig3 = z3;
+  __m256i orig4 = z4;
+  __m256i orig5 = z5;
+  __m256i orig6 = z6;
+  __m256i orig7 = z7;
+  __m256i orig8;
+  __m256i orig9;
+  __m256i orig10 = z10;
+  __m256i orig11 = z11;
+  __m256i orig12 = z12;
+  __m256i orig13 = z13;
+  __m256i orig14 = z14;
+  __m256i orig15 = z15;
+
+  uint32_t in8;
+  uint32_t in9;
+  int i;
+
+  while(bytes >= 512)
+  {
+    /* vector implementation for z8 and z9 */
+    /* faster than the naive version for 8 blocks */
+    const __m256i addv8   = _mm256_set_epi64x(3, 2, 1, 0);
+    const __m256i addv9   = _mm256_set_epi64x(7, 6, 5, 4);
+    const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+    __m256i t8, t9;
+    uint64_t in89;
+
+    in8  = x[8];
+    in9  = x[13]; /* see arrays above for the address translation */
+    in89 = ((uint64_t)in8) | (((uint64_t)in9) << 32);
+
+    z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
+
+    t8 = _mm256_add_epi64(addv8, z8);
+    t9 = _mm256_add_epi64(addv9, z9);
+
+    z8 = _mm256_unpacklo_epi32(t8, t9);
+    z9 = _mm256_unpackhi_epi32(t8, t9);
+
+    t8 = _mm256_unpacklo_epi32(z8, z9);
+    t9 = _mm256_unpackhi_epi32(z8, z9);
+
+    /* required because unpack* are intra-lane */
+    z8 = _mm256_permutevar8x32_epi32(t8, permute);
+    z9 = _mm256_permutevar8x32_epi32(t9, permute);
+
+    orig8 = z8;
+    orig9 = z9;
+
+    in89 += 8;
+
+    x[8]  = in89 & 0xFFFFFFFF;
+    x[13] = (in89 >> 32) & 0xFFFFFFFF;
+
+    z5  = orig5;
+    z10 = orig10;
+    z15 = orig15;
+    z14 = orig14;
+    z3  = orig3;
+    z6  = orig6;
+    z11 = orig11;
+    z1  = orig1;
+
+    z7  = orig7;
+    z13 = orig13;
+    z2  = orig2;
+    z9  = orig9;
+    z0  = orig0;
+    z12 = orig12;
+    z4  = orig4;
+    z8  = orig8;
+
+    for(i = 0; i < ROUNDS; i += 2)
+    {
+      /* the inner loop is a direct translation (regexp search/replace)
+       * from the amd64-xmm6 ASM */
+      __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14,
+          r15;
+
+      y4 = z12;
+      y4 = _mm256_add_epi32(y4, z0);
+      r4 = y4;
+      y4 = _mm256_slli_epi32(y4, 7);
+      z4 = _mm256_xor_si256(z4, y4);
+      r4 = _mm256_srli_epi32(r4, 25);
+      z4 = _mm256_xor_si256(z4, r4);
+
+      y9 = z1;
+      y9 = _mm256_add_epi32(y9, z5);
+      r9 = y9;
+      y9 = _mm256_slli_epi32(y9, 7);
+      z9 = _mm256_xor_si256(z9, y9);
+      r9 = _mm256_srli_epi32(r9, 25);
+      z9 = _mm256_xor_si256(z9, r9);
+
+      y8 = z0;
+      y8 = _mm256_add_epi32(y8, z4);
+      r8 = y8;
+      y8 = _mm256_slli_epi32(y8, 9);
+      z8 = _mm256_xor_si256(z8, y8);
+      r8 = _mm256_srli_epi32(r8, 23);
+      z8 = _mm256_xor_si256(z8, r8);
+
+      y13 = z5;
+      y13 = _mm256_add_epi32(y13, z9);
+      r13 = y13;
+      y13 = _mm256_slli_epi32(y13, 9);
+      z13 = _mm256_xor_si256(z13, y13);
+      r13 = _mm256_srli_epi32(r13, 23);
+      z13 = _mm256_xor_si256(z13, r13);
+
+      y12 = z4;
+      y12 = _mm256_add_epi32(y12, z8);
+      r12 = y12;
+      y12 = _mm256_slli_epi32(y12, 13);
+      z12 = _mm256_xor_si256(z12, y12);
+      r12 = _mm256_srli_epi32(r12, 19);
+      z12 = _mm256_xor_si256(z12, r12);
+
+      y1 = z9;
+      y1 = _mm256_add_epi32(y1, z13);
+      r1 = y1;
+      y1 = _mm256_slli_epi32(y1, 13);
+      z1 = _mm256_xor_si256(z1, y1);
+      r1 = _mm256_srli_epi32(r1, 19);
+      z1 = _mm256_xor_si256(z1, r1);
+
+      y0 = z8;
+      y0 = _mm256_add_epi32(y0, z12);
+      r0 = y0;
+      y0 = _mm256_slli_epi32(y0, 18);
+      z0 = _mm256_xor_si256(z0, y0);
+      r0 = _mm256_srli_epi32(r0, 14);
+      z0 = _mm256_xor_si256(z0, r0);
+
+      y5 = z13;
+      y5 = _mm256_add_epi32(y5, z1);
+      r5 = y5;
+      y5 = _mm256_slli_epi32(y5, 18);
+      z5 = _mm256_xor_si256(z5, y5);
+      r5 = _mm256_srli_epi32(r5, 14);
+      z5 = _mm256_xor_si256(z5, r5);
+
+      y14 = z6;
+      y14 = _mm256_add_epi32(y14, z10);
+      r14 = y14;
+      y14 = _mm256_slli_epi32(y14, 7);
+      z14 = _mm256_xor_si256(z14, y14);
+      r14 = _mm256_srli_epi32(r14, 25);
+      z14 = _mm256_xor_si256(z14, r14);
+
+      y3 = z11;
+      y3 = _mm256_add_epi32(y3, z15);
+      r3 = y3;
+      y3 = _mm256_slli_epi32(y3, 7);
+      z3 = _mm256_xor_si256(z3, y3);
+      r3 = _mm256_srli_epi32(r3, 25);
+      z3 = _mm256_xor_si256(z3, r3);
+
+      y2 = z10;
+      y2 = _mm256_add_epi32(y2, z14);
+      r2 = y2;
+      y2 = _mm256_slli_epi32(y2, 9);
+      z2 = _mm256_xor_si256(z2, y2);
+      r2 = _mm256_srli_epi32(r2, 23);
+      z2 = _mm256_xor_si256(z2, r2);
+
+      y7 = z15;
+      y7 = _mm256_add_epi32(y7, z3);
+      r7 = y7;
+      y7 = _mm256_slli_epi32(y7, 9);
+      z7 = _mm256_xor_si256(z7, y7);
+      r7 = _mm256_srli_epi32(r7, 23);
+      z7 = _mm256_xor_si256(z7, r7);
+
+      y6 = z14;
+      y6 = _mm256_add_epi32(y6, z2);
+      r6 = y6;
+      y6 = _mm256_slli_epi32(y6, 13);
+      z6 = _mm256_xor_si256(z6, y6);
+      r6 = _mm256_srli_epi32(r6, 19);
+      z6 = _mm256_xor_si256(z6, r6);
+
+      y11 = z3;
+      y11 = _mm256_add_epi32(y11, z7);
+      r11 = y11;
+      y11 = _mm256_slli_epi32(y11, 13);
+      z11 = _mm256_xor_si256(z11, y11);
+      r11 = _mm256_srli_epi32(r11, 19);
+      z11 = _mm256_xor_si256(z11, r11);
+
+      y10 = z2;
+      y10 = _mm256_add_epi32(y10, z6);
+      r10 = y10;
+      y10 = _mm256_slli_epi32(y10, 18);
+      z10 = _mm256_xor_si256(z10, y10);
+      r10 = _mm256_srli_epi32(r10, 14);
+      z10 = _mm256_xor_si256(z10, r10);
+
+      y1 = z3;
+      y1 = _mm256_add_epi32(y1, z0);
+      r1 = y1;
+      y1 = _mm256_slli_epi32(y1, 7);
+      z1 = _mm256_xor_si256(z1, y1);
+      r1 = _mm256_srli_epi32(r1, 25);
+      z1 = _mm256_xor_si256(z1, r1);
+
+      y15 = z7;
+      y15 = _mm256_add_epi32(y15, z11);
+      r15 = y15;
+      y15 = _mm256_slli_epi32(y15, 18);
+      z15 = _mm256_xor_si256(z15, y15);
+      r15 = _mm256_srli_epi32(r15, 14);
+      z15 = _mm256_xor_si256(z15, r15);
+
+      y6 = z4;
+      y6 = _mm256_add_epi32(y6, z5);
+      r6 = y6;
+      y6 = _mm256_slli_epi32(y6, 7);
+      z6 = _mm256_xor_si256(z6, y6);
+      r6 = _mm256_srli_epi32(r6, 25);
+      z6 = _mm256_xor_si256(z6, r6);
+
+      y2 = z0;
+      y2 = _mm256_add_epi32(y2, z1);
+      r2 = y2;
+      y2 = _mm256_slli_epi32(y2, 9);
+      z2 = _mm256_xor_si256(z2, y2);
+      r2 = _mm256_srli_epi32(r2, 23);
+      z2 = _mm256_xor_si256(z2, r2);
+
+      y7 = z5;
+      y7 = _mm256_add_epi32(y7, z6);
+      r7 = y7;
+      y7 = _mm256_slli_epi32(y7, 9);
+      z7 = _mm256_xor_si256(z7, y7);
+      r7 = _mm256_srli_epi32(r7, 23);
+      z7 = _mm256_xor_si256(z7, r7);
+
+      y3 = z1;
+      y3 = _mm256_add_epi32(y3, z2);
+      r3 = y3;
+      y3 = _mm256_slli_epi32(y3, 13);
+      z3 = _mm256_xor_si256(z3, y3);
+      r3 = _mm256_srli_epi32(r3, 19);
+      z3 = _mm256_xor_si256(z3, r3);
+
+      y4 = z6;
+      y4 = _mm256_add_epi32(y4, z7);
+      r4 = y4;
+      y4 = _mm256_slli_epi32(y4, 13);
+      z4 = _mm256_xor_si256(z4, y4);
+      r4 = _mm256_srli_epi32(r4, 19);
+      z4 = _mm256_xor_si256(z4, r4);
+
+      y0 = z2;
+      y0 = _mm256_add_epi32(y0, z3);
+      r0 = y0;
+      y0 = _mm256_slli_epi32(y0, 18);
+      z0 = _mm256_xor_si256(z0, y0);
+      r0 = _mm256_srli_epi32(r0, 14);
+      z0 = _mm256_xor_si256(z0, r0);
+
+      y5 = z7;
+      y5 = _mm256_add_epi32(y5, z4);
+      r5 = y5;
+      y5 = _mm256_slli_epi32(y5, 18);
+      z5 = _mm256_xor_si256(z5, y5);
+      r5 = _mm256_srli_epi32(r5, 14);
+      z5 = _mm256_xor_si256(z5, r5);
+
+      y11 = z9;
+      y11 = _mm256_add_epi32(y11, z10);
+      r11 = y11;
+      y11 = _mm256_slli_epi32(y11, 7);
+      z11 = _mm256_xor_si256(z11, y11);
+      r11 = _mm256_srli_epi32(r11, 25);
+      z11 = _mm256_xor_si256(z11, r11);
+
+      y12 = z14;
+      y12 = _mm256_add_epi32(y12, z15);
+      r12 = y12;
+      y12 = _mm256_slli_epi32(y12, 7);
+      z12 = _mm256_xor_si256(z12, y12);
+      r12 = _mm256_srli_epi32(r12, 25);
+      z12 = _mm256_xor_si256(z12, r12);
+
+      y8 = z10;
+      y8 = _mm256_add_epi32(y8, z11);
+      r8 = y8;
+      y8 = _mm256_slli_epi32(y8, 9);
+      z8 = _mm256_xor_si256(z8, y8);
+      r8 = _mm256_srli_epi32(r8, 23);
+      z8 = _mm256_xor_si256(z8, r8);
+
+      y13 = z15;
+      y13 = _mm256_add_epi32(y13, z12);
+      r13 = y13;
+      y13 = _mm256_slli_epi32(y13, 9);
+      z13 = _mm256_xor_si256(z13, y13);
+      r13 = _mm256_srli_epi32(r13, 23);
+      z13 = _mm256_xor_si256(z13, r13);
+
+      y9 = z11;
+      y9 = _mm256_add_epi32(y9, z8);
+      r9 = y9;
+      y9 = _mm256_slli_epi32(y9, 13);
+      z9 = _mm256_xor_si256(z9, y9);
+      r9 = _mm256_srli_epi32(r9, 19);
+      z9 = _mm256_xor_si256(z9, r9);
+
+      y14 = z12;
+      y14 = _mm256_add_epi32(y14, z13);
+      r14 = y14;
+      y14 = _mm256_slli_epi32(y14, 13);
+      z14 = _mm256_xor_si256(z14, y14);
+      r14 = _mm256_srli_epi32(r14, 19);
+      z14 = _mm256_xor_si256(z14, r14);
+
+      y10 = z8;
+      y10 = _mm256_add_epi32(y10, z9);
+      r10 = y10;
+      y10 = _mm256_slli_epi32(y10, 18);
+      z10 = _mm256_xor_si256(z10, y10);
+      r10 = _mm256_srli_epi32(r10, 14);
+      z10 = _mm256_xor_si256(z10, r10);
+
+      y15 = z13;
+      y15 = _mm256_add_epi32(y15, z14);
+      r15 = y15;
+      y15 = _mm256_slli_epi32(y15, 18);
+      z15 = _mm256_xor_si256(z15, y15);
+      r15 = _mm256_srli_epi32(r15, 14);
+      z15 = _mm256_xor_si256(z15, r15);
+    }

 /* store data; this macro first transposes the data in registers, and then
 * stores it in memory. Much faster with icc. */
-#define ONEQUAD_TRANSPOSE(A, B, C, D)                              \
-    {                                                              \
-        __m128i t0, t1, t2, t3;                                    \
-        z##A = _mm256_add_epi32(z##A, orig##A);                    \
-        z##B = _mm256_add_epi32(z##B, orig##B);                    \
-        z##C = _mm256_add_epi32(z##C, orig##C);                    \
-        z##D = _mm256_add_epi32(z##D, orig##D);                    \
-        y##A = _mm256_unpacklo_epi32(z##A, z##B);                  \
-        y##B = _mm256_unpacklo_epi32(z##C, z##D);                  \
-        y##C = _mm256_unpackhi_epi32(z##A, z##B);                  \
-        y##D = _mm256_unpackhi_epi32(z##C, z##D);                  \
-        z##A = _mm256_unpacklo_epi64(y##A, y##B);                  \
-        z##B = _mm256_unpackhi_epi64(y##A, y##B);                  \
-        z##C = _mm256_unpacklo_epi64(y##C, y##D);                  \
-        z##D = _mm256_unpackhi_epi64(y##C, y##D);                  \
-        t0   = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0),    \
-                           _mm_loadu_si128((__m128i*) (m + 0)));   \
-        _mm_storeu_si128((__m128i*) (c + 0), t0);                  \
-        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0),      \
-                           _mm_loadu_si128((__m128i*) (m + 64)));  \
-        _mm_storeu_si128((__m128i*) (c + 64), t1);                 \
-        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0),      \
-                           _mm_loadu_si128((__m128i*) (m + 128))); \
-        _mm_storeu_si128((__m128i*) (c + 128), t2);                \
-        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0),      \
-                           _mm_loadu_si128((__m128i*) (m + 192))); \
-        _mm_storeu_si128((__m128i*) (c + 192), t3);                \
-        t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1),      \
-                           _mm_loadu_si128((__m128i*) (m + 256))); \
-        _mm_storeu_si128((__m128i*) (c + 256), t0);                \
-        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1),      \
-                           _mm_loadu_si128((__m128i*) (m + 320))); \
-        _mm_storeu_si128((__m128i*) (c + 320), t1);                \
-        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1),      \
-                           _mm_loadu_si128((__m128i*) (m + 384))); \
-        _mm_storeu_si128((__m128i*) (c + 384), t2);                \
-        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1),      \
-                           _mm_loadu_si128((__m128i*) (m + 448))); \
-        _mm_storeu_si128((__m128i*) (c + 448), t3);                \
-    }
+#define ONEQUAD_TRANSPOSE(A, B, C, D)                         \
+  {                                                           \
+    __m128i t0, t1, t2, t3;                                   \
+    z##A = _mm256_add_epi32(z##A, orig##A);                   \
+    z##B = _mm256_add_epi32(z##B, orig##B);                   \
+    z##C = _mm256_add_epi32(z##C, orig##C);                   \
+    z##D = _mm256_add_epi32(z##D, orig##D);                   \
+    y##A = _mm256_unpacklo_epi32(z##A, z##B);                 \
+    y##B = _mm256_unpacklo_epi32(z##C, z##D);                 \
+    y##C = _mm256_unpackhi_epi32(z##A, z##B);                 \
+    y##D = _mm256_unpackhi_epi32(z##C, z##D);                 \
+    z##A = _mm256_unpacklo_epi64(y##A, y##B);                 \
+    z##B = _mm256_unpackhi_epi64(y##A, y##B);                 \
+    z##C = _mm256_unpacklo_epi64(y##C, y##D);                 \
+    z##D = _mm256_unpackhi_epi64(y##C, y##D);                 \
+    t0   = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0),   \
+                       _mm_loadu_si128((__m128i*)(m + 0))); \
+    _mm_storeu_si128((__m128i*)(c + 0), t0);                  \
+    t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0),     \
+                       _mm_loadu_si128((__m128i*)(m + 64)));  \
+    _mm_storeu_si128((__m128i*)(c + 64), t1);                 \
+    t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0),     \
+                       _mm_loadu_si128((__m128i*)(m + 128))); \
+    _mm_storeu_si128((__m128i*)(c + 128), t2);                \
+    t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0),     \
+                       _mm_loadu_si128((__m128i*)(m + 192))); \
+    _mm_storeu_si128((__m128i*)(c + 192), t3);                \
+    t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1),     \
+                       _mm_loadu_si128((__m128i*)(m + 256))); \
+    _mm_storeu_si128((__m128i*)(c + 256), t0);                \
+    t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1),     \
+                       _mm_loadu_si128((__m128i*)(m + 320))); \
+    _mm_storeu_si128((__m128i*)(c + 320), t1);                \
+    t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1),     \
+                       _mm_loadu_si128((__m128i*)(m + 384))); \
+    _mm_storeu_si128((__m128i*)(c + 384), t2);                \
+    t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1),     \
+                       _mm_loadu_si128((__m128i*)(m + 448))); \
+    _mm_storeu_si128((__m128i*)(c + 448), t3);                \
+  }

 #define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

-#define ONEQUAD_UNPCK(A, B, C, D)                 \
-    {                                             \
-        z##A = _mm256_add_epi32(z##A, orig##A);   \
-        z##B = _mm256_add_epi32(z##B, orig##B);   \
-        z##C = _mm256_add_epi32(z##C, orig##C);   \
-        z##D = _mm256_add_epi32(z##D, orig##D);   \
-        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
-        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
-        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
-        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
-        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
-        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
-        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
-        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
-    }
-
-#define ONEOCTO(A, B, C, D, A2, B2, C2, D2)                                     \
-    {                                                                           \
-        ONEQUAD_UNPCK(A, B, C, D);                                              \
-        ONEQUAD_UNPCK(A2, B2, C2, D2);                                          \
-        y##A  = _mm256_permute2x128_si256(z##A, z##A2, 0x20);                   \
-        y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31);                   \
-        y##B  = _mm256_permute2x128_si256(z##B, z##B2, 0x20);                   \
-        y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31);                   \
-        y##C  = _mm256_permute2x128_si256(z##C, z##C2, 0x20);                   \
-        y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31);                   \
-        y##D  = _mm256_permute2x128_si256(z##D, z##D2, 0x20);                   \
-        y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31);                   \
-        y##A  = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
-        y##B =                                                                  \
-            _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64)));    \
-        y##C =                                                                  \
-            _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128)));   \
-        y##D =                                                                  \
-            _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192)));   \
-        y##A2 =                                                                 \
-            _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256)));  \
-        y##B2 =                                                                 \
-            _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320)));  \
-        y##C2 =                                                                 \
-            _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384)));  \
-        y##D2 =                                                                 \
-            _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448)));  \
-        _mm256_storeu_si256((__m256i*) (c + 0), y##A);                          \
-        _mm256_storeu_si256((__m256i*) (c + 64), y##B);                         \
-        _mm256_storeu_si256((__m256i*) (c + 128), y##C);                        \
-        _mm256_storeu_si256((__m256i*) (c + 192), y##D);                        \
-        _mm256_storeu_si256((__m256i*) (c + 256), y##A2);                       \
-        _mm256_storeu_si256((__m256i*) (c + 320), y##B2);                       \
-        _mm256_storeu_si256((__m256i*) (c + 384), y##C2);                       \
-        _mm256_storeu_si256((__m256i*) (c + 448), y##D2);                       \
-    }
-
-        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
-        m += 32;
-        c += 32;
-        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
-        m -= 32;
-        c -= 32;
+#define ONEQUAD_UNPCK(A, B, C, D) /* z{A,B,C,D} += orig{A,B,C,D}, then transpose them as a 4x4 matrix of 32-bit words within each 128-bit lane; clobbers y{A,B,C,D} */ \
+  {                                           \
+    z##A = _mm256_add_epi32(z##A, orig##A);   /* lane-wise 32-bit add of the matching orig register */ \
+    z##B = _mm256_add_epi32(z##B, orig##B);   \
+    z##C = _mm256_add_epi32(z##C, orig##C);   \
+    z##D = _mm256_add_epi32(z##D, orig##D);   \
+    y##A = _mm256_unpacklo_epi32(z##A, z##B); /* step 1: interleave 32-bit words of row pairs (A,B) and (C,D); low halves of each 128-bit lane */ \
+    y##B = _mm256_unpacklo_epi32(z##C, z##D); \
+    y##C = _mm256_unpackhi_epi32(z##A, z##B); /* same interleave, high halves of each 128-bit lane */ \
+    y##D = _mm256_unpackhi_epi32(z##C, z##D); \
+    z##A = _mm256_unpacklo_epi64(y##A, y##B); /* step 2: interleave 64-bit pairs; z##A now holds word 0 of A,B,C,D in each lane */ \
+    z##B = _mm256_unpackhi_epi64(y##A, y##B); /* word 1 of A,B,C,D */ \
+    z##C = _mm256_unpacklo_epi64(y##C, y##D); /* word 2 of A,B,C,D */ \
+    z##D = _mm256_unpackhi_epi64(y##C, y##D); /* word 3 of A,B,C,D */ \
+  }
+
+#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) /* add orig and transpose two groups of four z registers, regroup their 128-bit halves, XOR with data read from m and store to c */ \
+  {                                                                           \
+    ONEQUAD_UNPCK(A, B, C, D); /* first group: results left in z{A,B,C,D} */ \
+    ONEQUAD_UNPCK(A2, B2, C2, D2); /* second group: results left in z{A2,B2,C2,D2} */ \
+    y##A  = _mm256_permute2x128_si256(z##A, z##A2, 0x20); /* 0x20: low 128 bits of z##A followed by low 128 bits of z##A2 */ \
+    y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); /* 0x31: high 128 bits of z##A followed by high 128 bits of z##A2 */ \
+    y##B  = _mm256_permute2x128_si256(z##B, z##B2, 0x20);                     \
+    y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31);                     \
+    y##C  = _mm256_permute2x128_si256(z##C, z##C2, 0x20);                     \
+    y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31);                     \
+    y##D  = _mm256_permute2x128_si256(z##D, z##D2, 0x20);                     \
+    y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31);                     \
+    y##A  = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*)(m + 0))); /* XOR with an unaligned 256-bit load from m; NOTE(review): m is presumably the input byte pointer and c the output - confirm in the enclosing function */ \
+    y##B  = _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*)(m + 64)));   \
+    y##C  = _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*)(m + 128)));  \
+    y##D  = _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*)(m + 192)));  \
+    y##A2 = _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*)(m + 256))); \
+    y##B2 = _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*)(m + 320))); \
+    y##C2 = _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*)(m + 384))); \
+    y##D2 = _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*)(m + 448))); \
+    _mm256_storeu_si256((__m256i*)(c + 0), y##A); /* unaligned 256-bit stores to c at the same offsets used for the loads from m */ \
+    _mm256_storeu_si256((__m256i*)(c + 64), y##B);                            \
+    _mm256_storeu_si256((__m256i*)(c + 128), y##C);                           \
+    _mm256_storeu_si256((__m256i*)(c + 192), y##D);                           \
+    _mm256_storeu_si256((__m256i*)(c + 256), y##A2);                          \
+    _mm256_storeu_si256((__m256i*)(c + 320), y##B2);                          \
+    _mm256_storeu_si256((__m256i*)(c + 384), y##C2);                          \
+    _mm256_storeu_si256((__m256i*)(c + 448), y##D2);                          \
+  }
+
+    ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
+    m += 32;
+    c += 32;
+    ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
+    m -= 32;
+    c -= 32;

 #undef ONEQUAD
 #undef ONEQUAD_TRANSPOSE
 #undef ONEQUAD_UNPCK
 #undef ONEOCTO

-        bytes -= 512;
-        c += 512;
-        m += 512;
-    }
+    bytes -= 512;
+    c += 512;
+    m += 512;
+  }
 }