x86-64 AVX2 assembly implementation of get_checksum1() (#174)

Shark64 authored 2021-09-27 03:16:55 +02:00, committed by GitHub
parent 97f4d48a07
commit 265785b7b9
4 changed files with 184 additions and 123 deletions

Makefile.in

@@ -29,7 +29,7 @@ SHELL=/bin/sh
.SUFFIXES:
.SUFFIXES: .c .o
SIMD_x86_64=simd-checksum-x86_64.o
SIMD_x86_64=simd-checksum-x86_64.o simd-checksum-avx2.o
ASM_x86_64=lib/md5-asm-x86_64.o
GENFILES=configure.sh aclocal.m4 config.h.in rsync.1 rsync.1.html \
@@ -140,6 +140,9 @@ git-version.h: mkgitver $(wildcard $(srcdir)/.git/logs/HEAD)
simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
@$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
simd-checksum-avx2.o: simd-checksum-avx2.S
@$(srcdir)/cmdormsg disable-asm $(CC) $(CFLAGS) --include=$(srcdir)/rsync.h -DAVX2_ASM -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/simd-checksum-avx2.S
lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
@$(srcdir)/cmdormsg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
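The new rule force-includes rsync.h so the assembly source picks up CHAR_OFFSET, while -DAVX2_ASM makes rsync.h hide everything else from the assembler (see the header change below). Assuming configure substitutes cc for $(CC) and -Wa,--noexecstack for @NOEXECSTACK@, and with cmdormsg only wrapping the command so that a failure suggests the disable-asm configure option, the rule runs roughly:

    cc $(CFLAGS) --include=rsync.h -DAVX2_ASM -I. -Wa,--noexecstack \
        -c -o simd-checksum-avx2.o simd-checksum-avx2.S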

rsync.h

@@ -18,6 +18,11 @@
* with this program; if not, visit the http://fsf.org website.
*/
/* a non-zero CHAR_OFFSET makes the rolling sum stronger, but is
incompatible with older versions :-( */
#define CHAR_OFFSET 0
#ifndef AVX2_ASM /* do not include the rest of file for assembly */
#define False 0
#define True 1
#define Unset (-1) /* Our BOOL values are always an int. */
@@ -38,9 +43,6 @@
#define BACKUP_SUFFIX "~"
/* a non-zero CHAR_OFFSET makes the rolling sum stronger, but is
incompatible with older versions :-( */
#define CHAR_OFFSET 0
/* These flags are only used during the flist transfer. */
@@ -1472,3 +1474,4 @@ const char *get_panic_action(void);
fprintf(stderr, "%s in %s at line %d\n", msg, __FILE__, __LINE__); \
exit_cleanup(RERR_UNSUPPORTED); \
} while (0)
#endif /* AVX2_ASM */

simd-checksum-avx2.S (new file, 169 lines)

@@ -0,0 +1,169 @@
#ifdef __APPLE__
#define get_checksum1_avx2 _get_checksum1_avx2
#endif
.intel_syntax noprefix
.text
.p2align 5
.globl get_checksum1_avx2
# rdi=*buf, esi=len, edx=i, rcx=*ps1, r8=*ps2
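# C prototype (declared in simd-checksum-x86_64.cpp):
#   extern "C" int32 get_checksum1_avx2(schar *buf, int32 len, int32 i,
#                                       uint32 *ps1, uint32 *ps2);
# Consumes whole 64-byte blocks starting at buf[i], updates *ps1 and
# *ps2 in place, and returns the updated index i in eax.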
get_checksum1_avx2:
vmovd xmm6,[rcx] # load *ps1
lea eax, [rsi-128] # at least 128 bytes to process?
cmp edx, eax
jg .exit
lea rax, .mul_T2[rip]
vmovntdqa ymm7, [rax] # load T2 multiplication constants
vmovntdqa ymm12, [rax+32] # from memory.
vpcmpeqd ymm15, ymm15, ymm15 # set all elements to -1.
#if CHAR_OFFSET != 0
mov eax, 32*CHAR_OFFSET
vmovd xmm10, eax
vpbroadcastd ymm10, xmm10
mov eax, 528*CHAR_OFFSET
vmovd xmm13, eax
vpbroadcastd ymm13, xmm13
#endif
vpabsb ymm15, ymm15 # set all byte size elements to 1.
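# All-ones coefficients let vpmaddubsw in the loop below emit pairwise
# byte sums (the s1 input) in a single instruction.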
add rdi, rdx
vmovdqu ymm2, [rdi] # preload the first 64 bytes.
vmovdqu ymm3, [rdi+32]
and esi, ~63 # only needed during final reduction,
# done here to avoid a longer nop for
# alignment below.
add edx, esi
shr rsi, 6 # longer opcode for alignment
add rdi, 64
vpxor xmm1, xmm1, xmm1 # reset both partial sums accumulators.
vpxor xmm4, xmm4, xmm4
mov eax, [r8]
.p2align 4 # should fit into the LSD allocation queue.
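# Per iteration, in terms of the scalar checksum: ymm6 accumulates the
# byte sums feeding s1, ymm1 the position-weighted (t2) sums feeding
# s2, and ymm4 adds in the running s1 partials once per block so the
# 64*s1 term of s2 can be applied as a single shift in the reduction.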
.loop:
vpmaddubsw ymm0, ymm15, ymm2 # s1 partial sums
vpmaddubsw ymm5, ymm15, ymm3
vmovdqu ymm8, [rdi] # preload the next
vmovdqu ymm9, [rdi+32] # 64 bytes.
add rdi, 64
vpaddd ymm4, ymm4, ymm6
vpaddw ymm5, ymm5, ymm0
vpsrld ymm0, ymm5, 16
vpaddw ymm5, ymm0, ymm5
vpaddd ymm6, ymm5, ymm6
vpmaddubsw ymm2, ymm7, ymm2 # s2 partial sums
vpmaddubsw ymm3, ymm12, ymm3
prefetcht0 [rdi+384] # prefetch 6 cachelines ahead.
vpaddw ymm3, ymm2, ymm3
vpsrldq ymm2, ymm3, 2
vpaddd ymm3, ymm2, ymm3
vpaddd ymm1, ymm1, ymm3
#if CHAR_OFFSET != 0
vpaddd ymm6, ymm10, ymm6 # 32*CHAR_OFFSET
vpaddd ymm1, ymm13, ymm1 # 528*CHAR_OFFSET
#endif
vmovdqa ymm2, ymm8 # move the next 64 bytes
vmovdqa ymm3, ymm9 # into the right registers
sub esi, 1
jnz .loop
# now we reduce the partial sums.
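# ymm0 = (ymm4 << 6) + ymm1 collects every s2 contribution (64*s1 per
# block plus the weighted sums); s1 stays in ymm6. The shift/add
# ladder folds the dword lanes of both registers down to scalars.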
vpslld ymm3, ymm4, 6
vpsrldq ymm2, ymm6, 4
vpaddd ymm0, ymm3, ymm1
vpaddd ymm6, ymm2, ymm6
vpsrlq ymm3, ymm0, 32
vpsrldq ymm2, ymm6, 8
vpaddd ymm0, ymm3, ymm0
vpsrldq ymm3, ymm0, 8
vpaddd ymm6, ymm2, ymm6
vpaddd ymm0, ymm3, ymm0
vextracti128 xmm2, ymm6, 0x1
vextracti128 xmm1, ymm0, 0x1
vpaddd xmm6, xmm2, xmm6
vmovd [rcx], xmm6
vpaddd xmm1, xmm1, xmm0
vmovd ecx, xmm1
add eax, ecx
mov [r8], eax
.exit:
vzeroupper
mov eax, edx
ret
#ifdef __APPLE__
.data
.align 6
#else
.section .rodata
.p2align 6
#endif
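# Weights 64..1 for the t2 partial sums: vpmaddubsw pairs each input
# byte with its coefficient, so byte j of a 64-byte block is scaled by
# (64 - j), matching the scalar formula in simd-checksum-x86_64.cpp.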
.mul_T2:
.byte 64
.byte 63
.byte 62
.byte 61
.byte 60
.byte 59
.byte 58
.byte 57
.byte 56
.byte 55
.byte 54
.byte 53
.byte 52
.byte 51
.byte 50
.byte 49
.byte 48
.byte 47
.byte 46
.byte 45
.byte 44
.byte 43
.byte 42
.byte 41
.byte 40
.byte 39
.byte 38
.byte 37
.byte 36
.byte 35
.byte 34
.byte 33
.byte 32
.byte 31
.byte 30
.byte 29
.byte 28
.byte 27
.byte 26
.byte 25
.byte 24
.byte 23
.byte 22
.byte 21
.byte 20
.byte 19
.byte 18
.byte 17
.byte 16
.byte 15
.byte 14
.byte 13
.byte 12
.byte 11
.byte 10
.byte 9
.byte 8
.byte 7
.byte 6
.byte 5
.byte 4
.byte 3
.byte 2
.byte 1

simd-checksum-x86_64.cpp

@@ -85,7 +85,6 @@ typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, _
#define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
#define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
@@ -246,7 +245,7 @@ __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf
// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
__m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
__m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
__m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);
// s2 += 32*s1
@@ -311,120 +310,7 @@ __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf
return i;
}
/*
AVX2 loop per 64 bytes:
int16 t1[16];
int16 t2[16];
for (int j = 0; j < 16; j++) {
t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
}
s2 += 64*s1 + (uint32)(
60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
) + 2080*CHAR_OFFSET;
s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
64*CHAR_OFFSET;
*/
__attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
if (len > 64) {
uint32 x[4] = {0};
__m128i ss1 = _mm_cvtsi32_si128(*ps1);
__m128i ss2 = _mm_cvtsi32_si128(*ps2);
const char mul_t1_buf[16] = {60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0};
__m128i tmp = _mm_load_si128((__m128i*) mul_t1_buf);
__m256i mul_t1 = _mm256_cvtepu8_epi16(tmp);
__m256i mul_const = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(4 | (3 << 8) | (2 << 16) | (1 << 24)));
__m256i mul_one;
mul_one = _mm256_abs_epi8(_mm256_cmpeq_epi16(mul_one,mul_one)); // set all vector elements to 1
for (; i < (len-64); i+=64) {
// Load ... 4*[int8*16]
__m256i in8_1, in8_2;
__m128i in8_1_low, in8_2_low, in8_1_high, in8_2_high;
in8_1_low = _mm_loadu_si128((__m128i_u*)&buf[i]);
in8_2_low = _mm_loadu_si128((__m128i_u*)&buf[i+16]);
in8_1_high = _mm_loadu_si128((__m128i_u*)&buf[i+32]);
in8_2_high = _mm_loadu_si128((__m128i_u*)&buf[i+48]);
in8_1 = _mm256_inserti128_si256(_mm256_castsi128_si256(in8_1_low), in8_1_high,1);
in8_2 = _mm256_inserti128_si256(_mm256_castsi128_si256(in8_2_low), in8_2_high,1);
// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
// Fastest, even though multiply by 1
__m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
__m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);
// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
__m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
__m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);
// s2 += 64*s1
ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 6));
// [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
__m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_epi32(sum_add32, 16));
sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_si256(sum_add32, 4));
sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_si256(sum_add32, 8));
// [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
__m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_epi32(sum_mul_add32, 16));
sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_si256(sum_mul_add32, 4));
sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_si256(sum_mul_add32, 8));
// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
__m128i sum_add32_hi = _mm256_extracti128_si256(sum_add32, 0x1);
ss1 = _mm_add_epi32(ss1, _mm256_castsi256_si128(sum_add32));
ss1 = _mm_add_epi32(ss1, sum_add32_hi);
// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
__m128i sum_mul_add32_hi = _mm256_extracti128_si256(sum_mul_add32, 0x1);
ss2 = _mm_add_epi32(ss2, _mm256_castsi256_si128(sum_mul_add32));
ss2 = _mm_add_epi32(ss2, sum_mul_add32_hi);
// [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
// We could've combined this with generating sum_add32 above and
// save an instruction but benchmarking shows that as being slower
__m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);
// [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
__m256i mul32 = _mm256_madd_epi16(add16, mul_t1);
// [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));
// prefetch 2 cachelines ahead
_mm_prefetch(&buf[i + 160], _MM_HINT_T0);
// s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
__m128i mul32_hi = _mm256_extracti128_si256(mul32, 0x1);
ss2 = _mm_add_epi32(ss2, _mm256_castsi256_si128(mul32));
ss2 = _mm_add_epi32(ss2, mul32_hi);
#if CHAR_OFFSET != 0
// s1 += 32*CHAR_OFFSET
__m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
// s2 += 528*CHAR_OFFSET
char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
}
_mm_store_si128((__m128i_u*)x, ss1);
*ps1 = x[0];
_mm_store_si128((__m128i_u*)x, ss2);
*ps2 = x[0];
}
return i;
}
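For reference, the intrinsics routine removed above and the new assembly implement the same recurrence as the scalar sketch below (checksum1_scalar is a hypothetical name; rsync's real code splits the loop across the SIMD helpers and packs the result in get_checksum1_cpp):

    static uint32 checksum1_scalar(schar *buf, int32 len)
    {
        uint32 s1 = 0, s2 = 0;
        for (int32 i = 0; i < len; i++) {
            s1 += buf[i] + CHAR_OFFSET; /* running byte sum */
            s2 += s1;                   /* sum of the running sums */
        }
        return (s1 & 0xffff) + (s2 << 16);
    }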
extern "C" __attribute__ ((target("avx2"))) int32 get_checksum1_avx2(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2);
static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
@@ -452,7 +338,7 @@ static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
uint32 s2 = 0;
// multiples of 64 bytes using AVX2 (if available)
i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);
i = get_checksum1_avx2((schar*)buf1, len, i, &s1, &s2);
// multiples of 32 bytes using SSSE3 (if available)
i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);
@@ -514,14 +400,14 @@ static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uin
int main() {
int i;
unsigned char* buf = (unsigned char*)malloc(BLOCK_LEN);
unsigned char* buf = (unsigned char*)aligned_alloc(64,BLOCK_LEN);
for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);
benchmark("AVX2", get_checksum1_avx2, (schar*)buf, BLOCK_LEN);
free(buf);
return 0;