Make asm use more selectable

- Make the SIMD ASM code off by default. Use configure --enable-roll-asm
  to enable.
- Allow the MD5 ASM code to be requested even when OpenSSL is handling MD4
  checksums. Use configure --enable-md5-asm to enable.
Author: Wayne Davison
Date:   2022-03-03 17:00:57 -08:00
Parent: 26f4dbe12c
Commit: b81a509556
11 changed files with 283 additions and 109 deletions
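
For orientation before the per-file diffs: after this commit, the three checksum accelerators hang off three independent configure switches, and each one simply defines a macro in config.h. Below is a hypothetical excerpt of the result; the macro names are the AC_DEFINEs in the configure.ac hunk further down, and the surrounding contents of config.h will vary by host.

```c
/* Hypothetical config.h excerpt after
 *   ./configure --enable-roll-simd --enable-roll-asm --enable-md5-asm
 * on an x86_64 Linux host: */
#define USE_ROLL_SIMD 1	/* C++ SIMD rolling checksum (simd-checksum-x86_64.cpp) */
#define USE_ROLL_ASM 1	/* AVX2 asm rolling checksum (simd-checksum-avx2.S) */
#define USE_MD5_ASM 1	/* asm MD5 core (lib/md5-asm-x86_64.S), honored even with openssl */
```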

Makefile.in

@@ -30,8 +30,9 @@ SHELL=/bin/sh
.SUFFIXES:
.SUFFIXES: .c .o
SIMD_x86_64=simd-checksum-x86_64.o simd-checksum-avx2.o
ASM_x86_64=lib/md5-asm-x86_64.o
ROLL_SIMD_x86_64=simd-checksum-x86_64.o
ROLL_ASM_x86_64=simd-checksum-avx2.o
MD5_ASM_x86_64=lib/md5-asm-x86_64.o
GENFILES=configure.sh aclocal.m4 config.h.in rsync.1 rsync.1.html \
rsync-ssl.1 rsync-ssl.1.html rsyncd.conf.5 rsyncd.conf.5.html \
@@ -46,7 +47,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
util1.o util2.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
usage.o fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
OBJS3=progress.o pipe.o @ASM@ @SIMD@
OBJS3=progress.o pipe.o @MD5_ASM@ @ROLL_SIMD@ @ROLL_ASM@
DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
popt/popthelp.o popt/poptparse.o
@@ -147,13 +148,13 @@ git-version.h: ALWAYS_RUN
ALWAYS_RUN:
simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
@$(srcdir)/cmd-or-msg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
@$(srcdir)/cmd-or-msg disable-roll-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
simd-checksum-avx2.o: simd-checksum-avx2.S
@$(srcdir)/cmd-or-msg disable-asm $(CC) $(CFLAGS) --include=$(srcdir)/rsync.h -DAVX2_ASM -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/simd-checksum-avx2.S
@$(srcdir)/cmd-or-msg disable-roll-asm $(CC) $(CFLAGS) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/simd-checksum-avx2.S
lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
@$(srcdir)/cmd-or-msg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S lib/md-defines.h
@$(srcdir)/cmd-or-msg disable-md5-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
tls$(EXEEXT): $(TLS_OBJ)
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(TLS_OBJ) $(LIBS)

NEWS.md

@@ -136,7 +136,8 @@
(keeping the behavior the same as before), so specifying `--info=nonreg0`
can be used to turn the warnings off.
- More ASM optimizations from Shark64.
- An optional asm optimization for the rolling checksum from Shark64. Enable
it with `./configure --enable-roll-asm`.
- Using `--debug=FILTER` now outputs a caution message if a filter rule
has trailing whitespace.
@@ -192,14 +193,24 @@
using the output of `git describe` when building inside a non-shallow git
checkout, though.)
- Improved the IPv6 determination in configure.
- Renamed configure's `--enable-simd` option to `--enable-roll-simd` and added
the option `--enable-roll-asm` to use the new asm version of the code. Both
are x86_64/amd64 only.
- Made SIMD & ASM configure default to "no" on non-Linux hosts due to various
reports of problems on NetBSD & macOS hosts. These tests were also tweaked
to allow enabling the feature on a host_cpu of amd64 (was only x86_64).
- Renamed configure's `--enable-asm` option to `--enable-md5-asm` to avoid
confusion with the asm option for the rolling checksum. It is also honored
even when openssl crypto is in use. This allows: normal MD4 & MD5, normal
MD4 + asm MD5, openssl MD4 & MD5, or openssl MD4 + asm MD5. (See the
sketch after this list.)
- Made SIMD & asm configure checks default to "no" on non-Linux hosts due to
various reports of problems on NetBSD & macOS hosts. These were also
tweaked to allow enabling the feature on a host_cpu of amd64 (was only
allowed on x86_64 before).
- Fixed configure to not fail at the SIMD check when cross-compiling.
- Improved the IPv6 determination in configure.
- Compile the C files with `-pedantic-errors` (when possible) so that we will
get warned if a static initialization overflows in the future (among other
things).
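
The four combinations in that renamed-options entry come down to a single preprocessor decision, shown in full in the lib/mdigest.h hunk further down. A condensed sketch:

```c
/* Condensed from the lib/mdigest.h hunk below. MD4 follows USE_OPENSSL
 * alone; MD5 binds to OpenSSL only when OpenSSL is in use AND the asm
 * MD5 was not requested:
 *   normal MD4 + normal MD5    (no openssl, no --enable-md5-asm)
 *   normal MD4 + asm MD5       (no openssl, --enable-md5-asm)
 *   openssl MD4 + openssl MD5  (openssl, no --enable-md5-asm)
 *   openssl MD4 + asm MD5      (openssl, --enable-md5-asm) */
#if defined USE_OPENSSL && !defined USE_MD5_ASM
#define md5_context MD5_CTX	/* OpenSSL's MD5 */
#else
#define md5_context md_context	/* rsync's MD5, asm core if USE_MD5_ASM */
#endif
```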

checksum.c

@@ -179,7 +179,7 @@ int canonical_checksum(int csum_type)
return 0;
}
#ifndef HAVE_SIMD /* See simd-checksum-*.cpp. */
#ifndef USE_ROLL_SIMD /* See simd-checksum-*.cpp. */
/*
a simple 32 bit checksum that can be updated from either end
(inspired by Mark Adler's Adler-32 checksum)
@@ -222,23 +222,23 @@ void get_checksum2(char *buf, int32 len, char *sum)
}
#endif
case CSUM_MD5: {
MD5_CTX m5;
md5_context m5;
uchar seedbuf[4];
MD5_Init(&m5);
md5_begin(&m5);
if (proper_seed_order) {
if (checksum_seed) {
SIVALu(seedbuf, 0, checksum_seed);
MD5_Update(&m5, seedbuf, 4);
md5_update(&m5, seedbuf, 4);
}
MD5_Update(&m5, (uchar *)buf, len);
md5_update(&m5, (uchar *)buf, len);
} else {
MD5_Update(&m5, (uchar *)buf, len);
md5_update(&m5, (uchar *)buf, len);
if (checksum_seed) {
SIVALu(seedbuf, 0, checksum_seed);
MD5_Update(&m5, seedbuf, 4);
md5_update(&m5, seedbuf, 4);
}
}
MD5_Final((uchar *)sum, &m5);
md5_result(&m5, (uchar *)sum);
break;
}
case CSUM_MD4:
@@ -374,18 +374,18 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
}
#endif
case CSUM_MD5: {
MD5_CTX m5;
md5_context m5;
MD5_Init(&m5);
md5_begin(&m5);
for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
MD5_Update(&m5, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
md5_update(&m5, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
remainder = (int32)(len - i);
if (remainder > 0)
MD5_Update(&m5, (uchar *)map_ptr(buf, i, remainder), remainder);
md5_update(&m5, (uchar *)map_ptr(buf, i, remainder), remainder);
MD5_Final((uchar *)sum, &m5);
md5_result(&m5, (uchar *)sum);
break;
}
case CSUM_MD4:
@@ -443,7 +443,7 @@ static union {
#ifdef USE_OPENSSL
MD4_CTX m4;
#endif
MD5_CTX m5;
md5_context m5;
} ctx;
#ifdef SUPPORT_XXHASH
static XXH64_state_t* xxh64_state;
@@ -482,7 +482,7 @@ void sum_init(int csum_type, int seed)
break;
#endif
case CSUM_MD5:
MD5_Init(&ctx.m5);
md5_begin(&ctx.m5);
break;
case CSUM_MD4:
#ifdef USE_OPENSSL
@@ -532,7 +532,7 @@ void sum_update(const char *p, int32 len)
break;
#endif
case CSUM_MD5:
MD5_Update(&ctx.m5, (uchar *)p, len);
md5_update(&ctx.m5, (uchar *)p, len);
break;
case CSUM_MD4:
#ifdef USE_OPENSSL
@@ -597,7 +597,7 @@ int sum_end(char *sum)
}
#endif
case CSUM_MD5:
MD5_Final((uchar *)sum, &ctx.m5);
md5_result(&ctx.m5, (uchar *)sum);
break;
case CSUM_MD4:
#ifdef USE_OPENSSL

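The checksum.c changes are mechanical: every OpenSSL-style MD5_Init/MD5_Update/MD5_Final call becomes the lower-case wrapper API so the backend can be swapped at build time. Here is a minimal usage sketch of that API, assuming the rsync tree's rsync.h (for md5_context, uchar, int32, and the little-endian SIVALu store macro); it mirrors the proper_seed_order branch of get_checksum2() above:

```c
#include "rsync.h"

/* Hash a 4-byte little-endian seed, then the data, as the
 * proper_seed_order path above does. 'sum' receives MD5_DIGEST_LEN bytes. */
void seeded_md5(const char *buf, int32 len, int32 seed, char *sum)
{
	md5_context m5;
	uchar seedbuf[4];

	md5_begin(&m5);
	if (seed) {
		SIVALu(seedbuf, 0, seed);
		md5_update(&m5, seedbuf, 4);
	}
	md5_update(&m5, (uchar *)buf, len);
	md5_result(&m5, (uchar *)sum);
}
```
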
configure.ac

@@ -229,12 +229,12 @@ fi
AC_DEFINE_UNQUOTED(NOBODY_USER, "$NOBODY_USER", [unprivileged user--e.g. nobody])
AC_DEFINE_UNQUOTED(NOBODY_GROUP, "$NOBODY_GROUP", [unprivileged group for unprivileged user])
# SIMD optimizations
SIMD=
# rolling-checksum SIMD optimizations
ROLL_SIMD=
AC_MSG_CHECKING([whether to enable SIMD optimizations])
AC_ARG_ENABLE(simd,
AS_HELP_STRING([--enable-simd],[enable/disable to control SIMD optimizations (requires c++)]))
AC_MSG_CHECKING([whether to enable rolling-checksum SIMD optimizations])
AC_ARG_ENABLE(roll-simd,
AS_HELP_STRING([--enable-roll-simd],[enable/disable to control rolling-checksum SIMD optimizations (requires c++)]))
# Clang is crashing with -g -O2, so we'll get rid of -g for now.
CXXFLAGS=`echo "$CXXFLAGS" | sed 's/-g //'`
@@ -263,14 +263,14 @@ __attribute__ ((target("ssse3"))) void more_testing(char* buf, int len)
}
]])
if test x"$enable_simd" = x""; then
if test x"$enable_roll_simd" = x""; then
case "$host_os" in
*linux*) ;;
*) enable_simd=no ;;
*) enable_roll_simd=no ;;
esac
fi
if test x"$enable_simd" != x"no"; then
if test x"$enable_roll_simd" != x"no"; then
# For x86-64 SIMD, g++ >=5 or clang++ >=7 is required
if test x"$host_cpu" = x"x86_64" || test x"$host_cpu" = x"amd64"; then
AC_LANG(C++)
@@ -283,23 +283,23 @@ if test x"$enable_simd" != x"no"; then
AC_LANG(C)
if test x"$CXX_OK" = x"yes"; then
# AC_MSG_RESULT() is called below.
SIMD="$host_cpu"
elif test x"$enable_simd" = x"yes"; then
ROLL_SIMD="$host_cpu"
elif test x"$enable_roll_simd" = x"yes"; then
AC_MSG_RESULT(error)
AC_MSG_ERROR(The SIMD compilation test failed.
Omit --enable-simd to continue without it.)
AC_MSG_ERROR(The rolling-checksum SIMD compilation test failed.
Omit --enable-roll-simd to continue without it.)
fi
elif test x"$enable_simd" = x"yes"; then
elif test x"$enable_roll_simd" = x"yes"; then
AC_MSG_RESULT(unavailable)
AC_MSG_ERROR(The SIMD optimizations are currently x86_64|amd64 only.
Omit --enable-simd to continue without it.)
AC_MSG_ERROR(The rolling-checksum SIMD optimizations are currently x86_64|amd64 only.
Omit --enable-roll-simd to continue without it.)
fi
fi
if test x"$SIMD" != x""; then
AC_MSG_RESULT([yes ($SIMD)])
AC_DEFINE(HAVE_SIMD, 1, [Define to 1 to enable SIMD optimizations])
SIMD='$(SIMD_'"$SIMD)"
if test x"$ROLL_SIMD" != x""; then
AC_MSG_RESULT([yes ($ROLL_SIMD)])
AC_DEFINE(USE_ROLL_SIMD, 1, [Define to 1 to enable rolling-checksum SIMD optimizations])
ROLL_SIMD='$(ROLL_SIMD_'"$ROLL_SIMD)"
# We only use c++ for its target attribute dispatching, disable unneeded bulky features
CXXFLAGS="$CXXFLAGS -fno-exceptions -fno-rtti"
# Apple often has "g++" as a symlink for clang. Try to find out the truth.
@@ -311,7 +311,7 @@ else
AC_MSG_RESULT(no)
fi
AC_SUBST(SIMD)
AC_SUBST(ROLL_SIMD)
AC_MSG_CHECKING([if assembler accepts noexecstack])
OLD_CFLAGS="$CFLAGS"
@@ -433,45 +433,66 @@ if test x"$enable_openssl" != x"no"; then
err_msg="$err_msg$nl- Failed to find openssl/md4.h and openssl/md5.h for openssl crypto lib support."
no_lib="$no_lib openssl"
fi
if test x"$enable_asm" != x"yes"; then
enable_asm=no
if test x"$enable_md5_asm" != x"yes"; then
enable_md5_asm=no
fi
else
AC_MSG_RESULT(no)
fi
ASM=
MD5_ASM=
AC_MSG_CHECKING([whether to enable ASM optimizations])
AC_ARG_ENABLE(asm,
AS_HELP_STRING([--enable-asm],[enable/disable to control ASM optimizations]))
AC_MSG_CHECKING([whether to enable MD5 ASM optimizations])
AC_ARG_ENABLE(md5-asm,
AS_HELP_STRING([--enable-md5-asm],[enable/disable to control MD5 ASM optimizations]))
if test x"$enable_asm" = x""; then
if test x"$enable_md5_asm" = x""; then
case "$host_os" in
*linux*) ;;
*) enable_asm=no ;;
*) enable_md5_asm=no ;;
esac
fi
if test x"$enable_asm" != x"no"; then
if test x"$enable_md5_asm" != x"no"; then
if test x"$host_cpu" = x"x86_64" || test x"$host_cpu" = x"amd64"; then
ASM="$host_cpu"
elif test x"$enable_asm" = x"yes"; then
MD5_ASM="$host_cpu"
elif test x"$enable_md5_asm" = x"yes"; then
AC_MSG_RESULT(unavailable)
AC_MSG_ERROR(The ASM optimizations are currently x86_64|amd64 only.
Omit --enable-asm to continue without it.)
Omit --enable-md5-asm to continue without it.)
fi
fi
if test x"$ASM" != x""; then
AC_MSG_RESULT([yes ($ASM)])
AC_DEFINE(HAVE_ASM, 1, [Define to 1 to enable ASM optimizations])
ASM='$(ASM_'"$ASM)"
if test x"$MD5_ASM" != x""; then
AC_MSG_RESULT([yes ($MD5_ASM)])
AC_DEFINE(USE_MD5_ASM, 1, [Define to 1 to enable MD5 ASM optimizations])
MD5_ASM='$(MD5_ASM_'"$MD5_ASM)"
else
AC_MSG_RESULT(no)
fi
AC_SUBST(ASM)
AC_SUBST(MD5_ASM)
ROLL_ASM=
AC_MSG_CHECKING([whether to enable rolling-checksum ASM optimizations])
AC_ARG_ENABLE(roll-asm,
AS_HELP_STRING([--enable-roll-asm],[enable/disable to control rolling-checksum ASM optimizations (requires --enable-roll-simd)]))
if test x"$ROLL_SIMD" = x""; then
enable_roll_asm=no
fi
if test x"$enable_roll_asm" = x"yes"; then
ROLL_ASM="$host_cpu"
AC_MSG_RESULT([yes ($ROLL_ASM)])
AC_DEFINE(USE_ROLL_ASM, 1, [Define to 1 to enable rolling-checksum ASM optimizations (requires --enable-roll-simd)])
ROLL_ASM='$(ROLL_ASM_'"$ROLL_ASM)"
else
AC_MSG_RESULT(no)
fi
AC_SUBST(ROLL_ASM)
AC_MSG_CHECKING([whether to enable xxhash checksum support])
AC_ARG_ENABLE([xxhash],
@@ -1421,10 +1442,6 @@ esac
AC_CONFIG_FILES([Makefile lib/dummy zlib/dummy popt/dummy shconfig])
AC_OUTPUT
if test "$enable_openssl" = yes && test "$enable_asm" = yes; then
echo "*** Ignoring --enable-asm option -- using openssl for MD5 checksums ***"
fi
AC_MSG_RESULT()
AC_MSG_RESULT([ rsync $PACKAGE_VERSION configuration successful])
AC_MSG_RESULT()
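
One dependency worth noting: --enable-roll-asm only makes sense on top of --enable-roll-simd, since the asm routine is dispatched from the C++ SIMD code, and the block above enforces that by forcing enable_roll_asm=no whenever ROLL_SIMD is empty. Expressed as a hypothetical compile-time guard (not in the tree; configure handles this at configure time), the invariant is:

```c
/* Hypothetical guard; configure.ac enforces this instead by setting
 * enable_roll_asm=no when ROLL_SIMD is empty. */
#if defined USE_ROLL_ASM && !defined USE_ROLL_SIMD
#error "--enable-roll-asm requires --enable-roll-simd"
#endif
```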

lib/md5-asm-x86_64.S

@@ -27,7 +27,7 @@
#include "config.h"
#include "md-defines.h"
#if !defined USE_OPENSSL && CSUM_CHUNK == 64
#ifdef USE_MD5_ASM /* { */
#ifdef __APPLE__
#define md5_process_asm _md5_process_asm
@@ -698,4 +698,4 @@ md5_process_asm:
pop %rbp
ret
#endif /* !USE_OPENSSL ... */
#endif /* } USE_MD5_ASM */

lib/md5.c

@@ -20,7 +20,7 @@
#include "rsync.h"
#ifndef USE_OPENSSL
#if !defined USE_OPENSSL || USE_MD5_ASM /* { */
void md5_begin(md_context *ctx)
{
ctx->A = 0x67452301;
@@ -148,7 +148,10 @@ static void md5_process(md_context *ctx, const uchar data[CSUM_CHUNK])
ctx->D += D;
}
#if defined HAVE_ASM && CSUM_CHUNK == 64
#ifdef USE_MD5_ASM
#if CSUM_CHUNK != 64
#error The MD5 ASM code does not support CSUM_CHUNK != 64
#endif
extern void md5_process_asm(md_context *ctx, const void *data, size_t num);
#endif
@@ -176,20 +179,20 @@ void md5_update(md_context *ctx, const uchar *input, uint32 length)
left = 0;
}
#if defined HAVE_ASM && CSUM_CHUNK == 64
#ifdef USE_MD5_ASM /* { */
if (length >= CSUM_CHUNK) {
uint32 chunks = length / CSUM_CHUNK;
md5_process_asm(ctx, input, chunks);
length -= chunks * CSUM_CHUNK;
input += chunks * CSUM_CHUNK;
}
#else
#else /* } { */
while (length >= CSUM_CHUNK) {
md5_process(ctx, input);
length -= CSUM_CHUNK;
input += CSUM_CHUNK;
}
#endif
#endif /* } */
if (length)
memcpy(ctx->buffer + left, input, length);
@@ -221,9 +224,9 @@ void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN])
SIVALu(digest, 8, ctx->C);
SIVALu(digest, 12, ctx->D);
}
#endif
#endif /* } */
#ifdef TEST_MD5
#ifdef TEST_MD5 /* { */
void get_md5(uchar *out, const uchar *input, int n)
{
@@ -317,4 +320,4 @@ int main(int argc, char *argv[])
return 0;
}
#endif
#endif /* } */
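
The md5_update() hunk is the only behavioral fork in this file: the asm core takes a chunk count as its third argument and consumes every complete 64-byte chunk in one call, while the portable path processes one chunk per loop iteration. A condensed sketch, written as if inside lib/md5.c (md5_process() is file-local there; the remainder buffering around this fork is unchanged and omitted):

```c
static void md5_bulk(md_context *ctx, const uchar *input, uint32 length)
{
#ifdef USE_MD5_ASM
	if (length >= CSUM_CHUNK)	/* one call eats length/64 chunks */
		md5_process_asm(ctx, input, length / CSUM_CHUNK);
#else
	while (length >= CSUM_CHUNK) {	/* one 64-byte chunk per call */
		md5_process(ctx, input);
		input += CSUM_CHUNK;
		length -= CSUM_CHUNK;
	}
#endif
	/* caller buffers the trailing length % 64 bytes, as md5_update() does */
}
```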

lib/mdigest.h

@@ -17,12 +17,13 @@ void mdfour_begin(md_context *md);
void mdfour_update(md_context *md, const uchar *in, uint32 length);
void mdfour_result(md_context *md, uchar digest[MD4_DIGEST_LEN]);
#ifndef USE_OPENSSL
#define MD5_CTX md_context
#define MD5_Init md5_begin
#define MD5_Update md5_update
#define MD5_Final(digest, cptr) md5_result(cptr, digest)
#if defined USE_OPENSSL && !defined USE_MD5_ASM
#define md5_context MD5_CTX
#define md5_begin MD5_Init
#define md5_update MD5_Update
#define md5_result(cptr, digest) MD5_Final(digest, cptr)
#else
#define md5_context md_context
void md5_begin(md_context *ctx);
void md5_update(md_context *ctx, const uchar *input, uint32 length);
void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
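
Flipping the macro direction makes the md5_* names the canonical spelling: callers no longer care which backend is underneath, and the same source compiles in all four build combinations. A minimal caller sketch (assuming rsync.h, which pulls in this header):

```c
#include "rsync.h"

/* md5_context is MD5_CTX when openssl handles MD5, md_context otherwise;
 * either way this code is unchanged. */
void one_shot_md5(const uchar *in, uint32 n, uchar digest[MD5_DIGEST_LEN])
{
	md5_context ctx;
	md5_begin(&ctx);
	md5_update(&ctx, in, n);
	md5_result(&ctx, digest);
}
```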

rsync.h

@@ -18,11 +18,6 @@
* with this program; if not, visit the http://fsf.org website.
*/
/* a non-zero CHAR_OFFSET makes the rolling sum stronger, but is
incompatible with older versions :-( */
#define CHAR_OFFSET 0
#ifndef AVX2_ASM /* do not include the rest of file for assembly */
#define False 0
#define True 1
#define Unset (-1) /* Our BOOL values are always an int. */
@@ -43,6 +38,9 @@
#define BACKUP_SUFFIX "~"
/* a non-zero CHAR_OFFSET makes the rolling sum stronger, but is
incompatible with older versions :-( */
#define CHAR_OFFSET 0
/* These flags are only used during the flist transfer. */
@@ -1477,7 +1475,6 @@ const char *get_panic_action(void);
fprintf(stderr, "%s in %s at line %d\n", msg, __FILE__, __LINE__); \
exit_cleanup(RERR_UNSUPPORTED); \
} while (0)
#endif /* AVX2_ASM */
#ifdef HAVE_MALLINFO2
#define MEM_ALLOC_INFO mallinfo2

simd-checksum-avx2.S

@@ -1,15 +1,21 @@
#include "config.h"
#ifdef USE_ROLL_ASM /* { */
#define CHAR_OFFSET 0 /* Keep this the same as rsync.h, which isn't likely to change. */
#ifdef __APPLE__
#define get_checksum1_avx2 _get_checksum1_avx2
#define get_checksum1_avx2_asm _get_checksum1_avx2_asm
#endif
.intel_syntax noprefix
.text
.p2align 5
.globl get_checksum1_avx2
.globl get_checksum1_avx2_asm
# rdi=*buf, esi=len, edx=i, rcx= *ps1, r8= *ps2
get_checksum1_avx2:
get_checksum1_avx2_asm:
vmovd xmm6,[rcx] # load *ps1
lea eax, [rsi-128] # at least 128 bytes to process?
cmp edx, eax
@@ -167,3 +173,5 @@ get_checksum1_avx2:
.byte 3
.byte 2
.byte 1
#endif /* } USE_ROLL_ASM */

simd-checksum-x86_64.cpp

@@ -51,12 +51,12 @@
* GCC 4.x are not supported to ease configure.ac logic.
*/
#ifdef __x86_64__
#ifdef __cplusplus
#ifdef __x86_64__ /* { */
#ifdef __cplusplus /* { */
#include "rsync.h"
#ifdef HAVE_SIMD
#ifdef USE_ROLL_SIMD /* { */
#include <immintrin.h>
@@ -85,6 +85,9 @@ typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, _
#define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
#define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
#ifndef USE_ROLL_ASM
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
#endif
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
@@ -245,7 +248,7 @@ __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf
// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
__m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
__m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
__m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);
// s2 += 32*s1
@@ -310,7 +313,127 @@ __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf
return i;
}
extern "C" __attribute__ ((target("avx2"))) int32 get_checksum1_avx2(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2);
#ifdef USE_ROLL_ASM /* { */
extern "C" __attribute__ ((target("avx2"))) int32 get_checksum1_avx2_asm(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2);
#else /* } { */
/*
AVX2 loop per 64 bytes:
int16 t1[16];
int16 t2[16];
for (int j = 0; j < 16; j++) {
t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
}
s2 += 64*s1 + (uint32)(
60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
) + 2080*CHAR_OFFSET;
s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
64*CHAR_OFFSET;
*/
__attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
if (len > 64) {
uint32 x[4] = {0};
__m128i ss1 = _mm_cvtsi32_si128(*ps1);
__m128i ss2 = _mm_cvtsi32_si128(*ps2);
const char mul_t1_buf[16] = {60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0};
__m128i tmp = _mm_load_si128((__m128i*) mul_t1_buf);
__m256i mul_t1 = _mm256_cvtepu8_epi16(tmp);
__m256i mul_const = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(4 | (3 << 8) | (2 << 16) | (1 << 24)));
__m256i mul_one;
mul_one = _mm256_abs_epi8(_mm256_cmpeq_epi16(mul_one,mul_one)); // set all vector elements to 1
for (; i < (len-64); i+=64) {
// Load ... 4*[int8*16]
__m256i in8_1, in8_2;
__m128i in8_1_low, in8_2_low, in8_1_high, in8_2_high;
in8_1_low = _mm_loadu_si128((__m128i_u*)&buf[i]);
in8_2_low = _mm_loadu_si128((__m128i_u*)&buf[i+16]);
in8_1_high = _mm_loadu_si128((__m128i_u*)&buf[i+32]);
in8_2_high = _mm_loadu_si128((__m128i_u*)&buf[i+48]);
in8_1 = _mm256_inserti128_si256(_mm256_castsi128_si256(in8_1_low), in8_1_high,1);
in8_2 = _mm256_inserti128_si256(_mm256_castsi128_si256(in8_2_low), in8_2_high,1);
// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2], 1*buf[i+3]), ... 2*[int16*8]
// Fastest, even though multiply by 1
__m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
__m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);
// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2], buf[i+3]), ... 2*[int16*8]
__m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
__m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);
// s2 += 64*s1
ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 6));
// [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
__m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_epi32(sum_add32, 16));
sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_si256(sum_add32, 4));
sum_add32 = _mm256_add_epi16(sum_add32, _mm256_srli_si256(sum_add32, 8));
// [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
__m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_epi32(sum_mul_add32, 16));
sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_si256(sum_mul_add32, 4));
sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_srli_si256(sum_mul_add32, 8));
// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
__m128i sum_add32_hi = _mm256_extracti128_si256(sum_add32, 0x1);
ss1 = _mm_add_epi32(ss1, _mm256_castsi256_si128(sum_add32));
ss1 = _mm_add_epi32(ss1, sum_add32_hi);
// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
__m128i sum_mul_add32_hi = _mm256_extracti128_si256(sum_mul_add32, 0x1);
ss2 = _mm_add_epi32(ss2, _mm256_castsi256_si128(sum_mul_add32));
ss2 = _mm_add_epi32(ss2, sum_mul_add32_hi);
// [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
// We could've combined this with generating sum_add32 above and
// save an instruction but benchmarking shows that as being slower
__m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);
// [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
__m256i mul32 = _mm256_madd_epi16(add16, mul_t1);
// [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));
// prefetch 2 cache lines ahead
_mm_prefetch(&buf[i + 160], _MM_HINT_T0);
// s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
__m128i mul32_hi = _mm256_extracti128_si256(mul32, 0x1);
ss2 = _mm_add_epi32(ss2, _mm256_castsi256_si128(mul32));
ss2 = _mm_add_epi32(ss2, mul32_hi);
#if CHAR_OFFSET != 0
// s1 += 32*CHAR_OFFSET
__m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
// s2 += 528*CHAR_OFFSET
char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
}
_mm_store_si128((__m128i_u*)x, ss1);
*ps1 = x[0];
_mm_store_si128((__m128i_u*)x, ss2);
*ps2 = x[0];
}
return i;
}
#endif /* } !USE_ROLL_ASM */
static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
@@ -338,7 +461,11 @@ static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
uint32 s2 = 0;
// multiples of 64 bytes using AVX2 (if available)
i = get_checksum1_avx2((schar*)buf1, len, i, &s1, &s2);
#ifdef USE_ROLL_ASM
i = get_checksum1_avx2_asm((schar*)buf1, len, i, &s1, &s2);
#else
i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);
#endif
// multiples of 32 bytes using SSSE3 (if available)
i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);
@@ -407,7 +534,11 @@ int main() {
benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
benchmark("AVX2", get_checksum1_avx2, (schar*)buf, BLOCK_LEN);
#ifdef USE_ROLL_ASM
benchmark("AVX2-ASM", get_checksum1_avx2_asm, (schar*)buf, BLOCK_LEN);
#else
benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);
#endif
free(buf);
return 0;
@@ -417,6 +548,6 @@ int main() {
#pragma clang optimize on
#endif /* BENCHMARK_SIMD_CHECKSUM1 */
#endif /* HAVE_SIMD */
#endif /* __cplusplus */
#endif /* __x86_64__ */
#endif /* } USE_ROLL_SIMD */
#endif /* } __cplusplus */
#endif /* } __x86_64__ */
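
All of these vector paths (SSE2, SSSE3, the AVX2 intrinsics, and the AVX2 asm) must produce the same result as rsync's scalar rolling checksum, the Adler-32-inspired sum noted in the checksum.c comment above; the t1/t2 algebra in the AVX2 comment is this loop regrouped 64 bytes at a time. A minimal scalar reference follows (illustrative names; rsync's own get_checksum1() partially unrolls the loop):

```c
#include <stdint.h>
#include <stddef.h>

#define CHAR_OFFSET 0	/* matches rsync.h and simd-checksum-avx2.S */

/* s1 = running byte sum, s2 = sum of the running s1 values; the low
 * 16 bits of each are packed into one 32-bit checksum. */
uint32_t get_checksum1_ref(const signed char *buf, size_t len)
{
	uint32_t s1 = 0, s2 = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		s1 += (uint32_t)buf[i] + CHAR_OFFSET;
		s2 += s1;
	}
	return (s1 & 0xffff) + (s2 << 16);
}
```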

usage.c

@@ -139,21 +139,26 @@ static void print_info_flags(enum logcode f)
"*Optimizations",
#ifndef HAVE_SIMD
#ifndef USE_ROLL_SIMD
"no "
#endif
"SIMD",
"SIMD-roll",
#if !defined HAVE_ASM || defined USE_OPENSSL
#ifndef USE_ROLL_ASM
"no "
#endif
"asm",
"asm-roll",
#ifndef USE_OPENSSL
"no "
#endif
"openssl-crypto",
#ifndef USE_MD5_ASM
"no "
#endif
"asm-MD5",
NULL
};