#include "config.h"

#ifdef USE_ROLL_ASM /* { */

#define CHAR_OFFSET 0 /* Keep this the same as rsync.h, which isn't likely to change. */

#ifdef __APPLE__
#define get_checksum1_avx2_asm _get_checksum1_avx2_asm
#endif

.intel_syntax noprefix
.text

        .p2align 5
        .globl get_checksum1_avx2_asm

# rdi=*buf, esi=len, edx=i, rcx= *ps1, r8= *ps2
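# A rough scalar sketch of what this routine computes, for orientation only
# (s1/s2 are seeded from *ps1/*ps2, and CHAR_OFFSET is 0 in this file):
#
#       n = len & ~63;                  /* whole 64-byte blocks */
#       for (j = 0; j < n; j++) {
#               s1 += buf[i + j] + CHAR_OFFSET;
#               s2 += s1;
#       }
#       *ps1 = s1; *ps2 = s2;
#       return i + n;                   /* nothing is done if i > len-128 */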
get_checksum1_avx2_asm:
        vmovd xmm6,[rcx] # load *ps1
        lea eax, [rsi-128] # at least 128 bytes to process?
        cmp edx, eax
        jg .exit
        lea rax, .mul_T2[rip]
        vmovntdqa ymm7, [rax] # load T2 multiplication constants
        vmovntdqa ymm12,[rax+32] # from memory.
        vpcmpeqd ymm15, ymm15, ymm15 # set all elements to -1.

#if CHAR_OFFSET != 0
        mov eax, 32*CHAR_OFFSET
        vmovd xmm10, eax
        vpbroadcastd ymm10, xmm10
        mov eax, 528*CHAR_OFFSET
        vmovd xmm13, eax
        vpbroadcastd ymm13, xmm13
#endif
        vpabsb ymm15, ymm15 # set all byte size elements to 1.
        add rdi, rdx
        vmovdqu ymm2, [rdi] # preload the first 64 bytes.
        vmovdqu ymm3, [rdi+32]
        and esi, ~63 # only needed during final reduction,
                     # done here to avoid a longer nop for
                     # alignment below.
        add edx, esi
        shr rsi, 6 # longer opcode for alignment
        add rdi, 64
        vpxor xmm1, xmm1, xmm1 # reset both partial sums accumulators.
        vpxor xmm4, xmm4, xmm4
        mov eax, [r8]
        .p2align 4 # should fit into the LSD allocation queue.
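# Orientation (inferred from the loop below): each pass consumes 64 bytes.
# vpmaddubsw against ymm15 (every byte = 1) yields the plain byte sums that
# feed the s1 accumulator (ymm6); vpmaddubsw against ymm7/ymm12 (the
# descending .mul_T2 weights) yields the position-weighted sums that feed
# the s2 accumulator (ymm1). ymm4 adds in one snapshot of the s1 accumulator
# per pass; the reduction after the loop scales it by 64, since s2 grows by
# the pre-block value of s1 once for each of a block's 64 bytes.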
.loop:
        vpmaddubsw ymm0, ymm15, ymm2 # s1 partial sums
        vpmaddubsw ymm5, ymm15, ymm3
        vmovdqu ymm8, [rdi] # preload the next
        vmovdqu ymm9, [rdi+32] # 64 bytes.
        add rdi, 64
        vpaddd ymm4, ymm4, ymm6
        vpaddw ymm5, ymm5, ymm0
        vpsrld ymm0, ymm5, 16
        vpaddw ymm5, ymm0, ymm5
        vpaddd ymm6, ymm5, ymm6
        vpmaddubsw ymm2, ymm7, ymm2 # s2 partial sums
        vpmaddubsw ymm3, ymm12, ymm3
        prefetcht0 [rdi+384] # prefetch 6 cachelines ahead.
        vpaddw ymm3, ymm2, ymm3
        vpsrldq ymm2, ymm3, 2
        vpaddd ymm3, ymm2, ymm3
        vpaddd ymm1, ymm1, ymm3

#if CHAR_OFFSET != 0
        vpaddd ymm6, ymm10, ymm6 # 32*CHAR_OFFSET
        vpaddd ymm1, ymm13, ymm1 # 528*CHAR_OFFSET
#endif
        vmovdqa ymm2, ymm8 # move the next 64 bytes
        vmovdqa ymm3, ymm9 # into the right registers
        sub esi, 1
        jnz .loop

        # now we reduce the partial sums.
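        # (In outline: the per-pass s1 snapshots in ymm4 are scaled by 64
        # and merged with the s2 partials in ymm1; the dword lanes of that
        # result and of the s1 partials in ymm6 are folded horizontally,
        # the two 128-bit halves are added, and the low dwords are written
        # back to *ps1 and *ps2, the latter added to the incoming value
        # kept in eax.)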
        vpslld ymm3, ymm4, 6
        vpsrldq ymm2, ymm6, 4

        vpaddd ymm0, ymm3, ymm1
        vpaddd ymm6, ymm2, ymm6
        vpsrlq ymm3, ymm0, 32

        vpsrldq ymm2, ymm6, 8
        vpaddd ymm0, ymm3, ymm0
        vpsrldq ymm3, ymm0, 8
        vpaddd ymm6, ymm2, ymm6
        vpaddd ymm0, ymm3, ymm0
        vextracti128 xmm2, ymm6, 0x1
        vextracti128 xmm1, ymm0, 0x1
        vpaddd xmm6, xmm2, xmm6
        vmovd [rcx], xmm6
        vpaddd xmm1, xmm1, xmm0
        vmovd ecx, xmm1
        add eax, ecx
        mov [r8], eax
.exit:
        vzeroupper
        mov eax, edx
        ret

#ifdef __APPLE__
.data
        .align 6
#else
.section .rodata
        .p2align 6
#endif
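# The .mul_T2 table below holds the per-byte weights 64 down to 1: fed to
# vpmaddubsw, they give each byte of a 64-byte block the number of times it
# is, in effect, folded into s2 by the end of that block (the block's first
# byte 64 times, its last byte once).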
.mul_T2:
        .byte 64
        .byte 63
        .byte 62
        .byte 61
        .byte 60
        .byte 59
        .byte 58
        .byte 57
        .byte 56
        .byte 55
        .byte 54
        .byte 53
        .byte 52
        .byte 51
        .byte 50
        .byte 49
        .byte 48
        .byte 47
        .byte 46
        .byte 45
        .byte 44
        .byte 43
        .byte 42
        .byte 41
        .byte 40
        .byte 39
        .byte 38
        .byte 37
        .byte 36
        .byte 35
        .byte 34
        .byte 33
        .byte 32
        .byte 31
        .byte 30
        .byte 29
        .byte 28
        .byte 27
        .byte 26
        .byte 25
        .byte 24
        .byte 23
        .byte 22
        .byte 21
        .byte 20
        .byte 19
        .byte 18
        .byte 17
        .byte 16
        .byte 15
        .byte 14
        .byte 13
        .byte 12
        .byte 11
        .byte 10
        .byte 9
        .byte 8
        .byte 7
        .byte 6
        .byte 5
        .byte 4
        .byte 3
        .byte 2
        .byte 1

#endif /* } USE_ROLL_ASM */