You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
187 lines
3.4 KiB
ArmAsm
187 lines
3.4 KiB
ArmAsm
// Build only with the gc toolchain, and only when neither the "purego" nor
// the "noasm" tag is set. The //go:build line is the form used by Go 1.17+;
// the // +build line is kept so older toolchains apply the same constraint.
//go:build gc && !purego && !noasm
// +build gc,!purego,!noasm
|
|
|
|
#include "textflag.h"
|
|
|
|
// Register assignments shared by the macros and both functions below.

// Arguments, result and loop bookkeeping.
#define digest  R1 // d argument of writeBlocks (base address of the state).
#define h       R2 // Running hash; Sum64 stores it as its return value.
#define p       R3 // Input pointer, advanced as bytes are consumed.
#define len     R4 // Input length in bytes.
#define nblocks R5 // Number of whole 32-byte blocks left (starts at len / 32).

// Multiplier constants, loaded from the primes<> table.
#define prime1 R7
#define prime2 R8
#define prime3 R9
#define prime4 R10
#define prime5 R11

// The four accumulators updated by blocksLoop.
#define v1 R12
#define v2 R13
#define v3 R14
#define v4 R15

// Input words read from p; Sum64 also uses them as temporaries.
#define x1 R20
#define x2 R21
#define x3 R22
#define x4 R23
|
|
|
|
// round folds input word x into accumulator acc and leaves x unchanged:
//
//	acc = rotl(acc + x*prime2, 31) * prime1
//
// The rotate-left by 31 is written as a rotate-right by 64-31.
#define round(acc, x) \
	MADD prime2, acc, x, acc \
	ROR  $64-31, acc, acc    \
	MUL  prime1, acc, acc
|
|
|
|
// round0 is round with a zero accumulator, computed in place:
//
//	x = round(0, x) = rotl(x*prime2, 31) * prime1
#define round0(x) \
	MUL prime2, x, x \
	ROR $64-31, x, x \
	MUL prime1, x, x
|
|
|
|
// mergeRound mixes accumulator x into the hash h:
//
//	h = (h ^ round0(x))*prime1 + prime4
//
// x is overwritten with round0(x) in the process.
#define mergeRound(x) \
	round0(x)                 \
	EOR  x, h, h              \
	MADD h, prime4, prime1, h
|
|
|
|
// blocksLoop consumes every whole 32-byte block of the input, folding the
// four 8-byte words of each block into v1, v2, v3 and v4 respectively.
//
// On exit p points just past the last whole block and len is unchanged;
// nblocks and x1-x4 are clobbered.
//
// The caller must guarantee len >= 32: the counter is decremented before it
// is tested, so a starting count of zero would wrap around instead of
// ending the loop.
//
// The first LDP post-increments p by a full block, which is why the second
// pair of words is read at -16(p).
#define blocksLoop() \
	LSR     $5, len, nblocks     \
	PCALIGN $16                  \
	loop:                        \
	LDP.P   32(p), (x1, x2)      \
	round(v1, x1)                \
	LDP     -16(p), (x3, x4)     \
	round(v2, x2)                \
	SUB     $1, nblocks, nblocks \
	round(v3, x3)                \
	round(v4, x4)                \
	CBNZ    nblocks, loop
|
|
|
|
// primes<> is a file-local, read-only table holding the five 64-bit
// multiplier constants. They are repeated here, back to back, so that they
// form one contiguous 40-byte array and neighbouring pairs can be fetched
// with a single LDP.
DATA primes<>+0(SB)/8, $11400714785074694791  // prime1 = 0x9E3779B185EBCA87
DATA primes<>+8(SB)/8, $14029467366897019727  // prime2 = 0xC2B2AE3D27D4EB4F
DATA primes<>+16(SB)/8, $1609587929392839161  // prime3 = 0x165667B19E3779F9
DATA primes<>+24(SB)/8, $9650029242287828579  // prime4 = 0x85EBCA77C2B2AE63
DATA primes<>+32(SB)/8, $2870177450012600261  // prime5 = 0x27D4EB2F165667C5
GLOBL primes<>(SB), RODATA|NOPTR, $40
|
|
|
|
// func Sum64(b []byte) uint64
//
// Sum64 hashes b in a single call.
//
// Inputs of at least 32 bytes are first run through blocksLoop, after which
// the four accumulators are combined into h. Shorter inputs skip that phase
// and start from h = prime5. The remaining len%32 bytes are then consumed in
// 16-, 8-, 4-, 2- and 1-byte steps, each selected by the matching bit of
// len, and a final shift/multiply sequence mixes h before it is returned.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	LDP b_base+0(FP), (p, len) // p = data pointer of b, len = its length

	LDP  primes<>+0(SB), (prime1, prime2)
	LDP  primes<>+16(SB), (prime3, prime4)
	MOVD primes<>+32(SB), prime5

	// CSEL leaves the flags untouched, so BLO still acts on the CMP result.
	CMP  $32, len
	CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
	BLO  tail16

	// Initial accumulator values.
	ADD  prime1, prime2, v1 // v1 = prime1 + prime2
	MOVD prime2, v2         // v2 = prime2
	MOVD ZR, v3             // v3 = 0
	NEG  prime1, v4         // v4 = -prime1

	blocksLoop()

	// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
	ROR $64-1, v1, x1
	ROR $64-7, v2, x2
	ADD x1, x2, x2
	ROR $64-12, v3, x3
	ROR $64-18, v4, x4
	ADD x3, x4, x4
	ADD x2, x4, h

	mergeRound(v1)
	mergeRound(v2)
	mergeRound(v3)
	mergeRound(v4)

tail16:
	ADD len, h, h // fold the total input length into the hash

	// In each step below h is rotated first and the equally rotated input
	// word is XORed in afterwards. Rotation distributes over XOR, so the
	// result is rotl(h ^ word, k).

	// Bit 4 of len set: consume two 8-byte words.
	TBZ   $4, len, tail8
	LDP.P 16(p), (x1, x2)

	round0(x1)
	ROR  $64-27, h, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h // h = h*prime1 + prime4

	round0(x2)
	ROR  $64-27, h, h
	EOR  x2 @> 64-27, h, h
	MADD h, prime4, prime1, h // h = h*prime1 + prime4

tail8:
	// Bit 3 of len set: consume one 8-byte word.
	TBZ    $3, len, tail4
	MOVD.P 8(p), x1

	round0(x1)
	ROR  $64-27, h, h
	EOR  x1 @> 64-27, h, h
	MADD h, prime4, prime1, h // h = h*prime1 + prime4

tail4:
	// Bit 2 of len set: consume one zero-extended 4-byte word.
	TBZ     $2, len, tail2
	MOVWU.P 4(p), x2

	MUL  prime1, x2, x2
	ROR  $64-23, h, h
	EOR  x2 @> 64-23, h, h
	MADD h, prime3, prime2, h // h = h*prime2 + prime3

tail2:
	// Bit 1 of len set: load two bytes at once and mix them one at a
	// time, low byte (x1) first, then high byte (x2).
	TBZ     $1, len, tail1
	MOVHU.P 2(p), x3
	AND     $255, x3, x1
	LSR     $8, x3, x2

	MUL prime5, x1, x1
	ROR $64-11, h, h
	EOR x1 @> 64-11, h, h
	MUL prime1, h, h

	MUL prime5, x2, x2
	ROR $64-11, h, h
	EOR x2 @> 64-11, h, h
	MUL prime1, h, h

tail1:
	// Bit 0 of len set: consume the final byte.
	TBZ   $0, len, avalanche
	MOVBU (p), x4

	MUL prime5, x4, x4
	ROR $64-11, h, h
	EOR x4 @> 64-11, h, h
	MUL prime1, h, h

avalanche:
	// Final mixing: alternate xor-shifts and multiplications.
	EOR h >> 33, h, h
	MUL prime2, h, h
	EOR h >> 29, h, h
	MUL prime3, h, h
	EOR h >> 32, h, h

	MOVD h, ret+24(FP)
	RET
|
|
|
|
// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs every whole 32-byte block of b through blocksLoop,
// updating the four accumulators stored in the first 32 bytes of *d.
// It returns len(b) rounded down to a multiple of 32, which is the number
// of bytes the loop consumed.
//
// The caller must ensure len(b) >= 32 (required by blocksLoop).
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// round only needs the first two primes.
	LDP primes<>+0(SB), (prime1, prime2)

	// Fetch the arguments.
	MOVD d+0(FP), digest
	LDP  b_base+8(FP), (p, len)

	// Load the state. This assumes v1-v4 are stored contiguously at the
	// start of the Digest.
	LDP 0(digest), (v1, v2)
	LDP 16(digest), (v3, v4)

	blocksLoop()

	// Write the updated state back.
	STP (v1, v2), 0(digest)
	STP (v3, v4), 16(digest)

	// Clear the low five bits of len to get the byte count consumed.
	BIC  $31, len, len
	MOVD len, ret+32(FP)
	RET
|