From 69a0db63f61bf37afaa47b08d874c494214deea7 Mon Sep 17 00:00:00 2001 From: Andrey Prygunkov Date: Sun, 8 Oct 2017 20:49:13 +0200 Subject: [PATCH] #454: integrated node-yencode library by Anime Tosho 1) integrated the library; 2) split units by CPU architecture; 3) extended makefile and configure script to detect CPU architecture and use appropriate compiler flags; 4) runtime CPU features detection for x86 and ARM with dynamic code dispatching; 5) temporary (for test purposes) printing info about SIMD support to stdout on program startup; 6) new SIMD routines are not yet used in the program --- Makefile.am | 20 +- Makefile.in | 69 +++++- configure | 93 ++++--- configure.ac | 51 +++- daemon/main/nzbget.cpp | 1 + daemon/main/nzbget.h | 13 +- daemon/nntp/Decoder.cpp | 14 ++ daemon/nntp/Decoder.h | 1 + lib/yencode/ArmCrc.cpp | 94 +++++++ lib/yencode/NeonDecoder.cpp | 272 +++++++++++++++++++++ lib/yencode/PclmulCrc.cpp | 444 ++++++++++++++++++++++++++++++++++ lib/yencode/ScalarDecoder.cpp | 130 ++++++++++ lib/yencode/SimdInit.cpp | 141 +++++++++++ lib/yencode/Sse2Decoder.cpp | 230 ++++++++++++++++++ lib/yencode/Ssse3Decoder.cpp | 243 +++++++++++++++++++ lib/yencode/YEncode.h | 38 +++ nzbget.vcxproj | 19 +- 17 files changed, 1796 insertions(+), 77 deletions(-) create mode 100644 lib/yencode/ArmCrc.cpp create mode 100644 lib/yencode/NeonDecoder.cpp create mode 100644 lib/yencode/PclmulCrc.cpp create mode 100644 lib/yencode/ScalarDecoder.cpp create mode 100644 lib/yencode/SimdInit.cpp create mode 100644 lib/yencode/Sse2Decoder.cpp create mode 100644 lib/yencode/Ssse3Decoder.cpp create mode 100644 lib/yencode/YEncode.h diff --git a/Makefile.am b/Makefile.am index 346b3906..b1af410e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -214,6 +214,23 @@ nzbget_SOURCES += \ lib/par2/verificationpacket.h endif +# Simd decoder and Crc32 +nzbget_SOURCES += \ + lib/yencode/YEncode.h \ + lib/yencode/SimdInit.cpp \ + lib/yencode/ScalarDecoder.cpp \ + lib/yencode/Sse2Decoder.cpp \ + 
lib/yencode/Ssse3Decoder.cpp \ + lib/yencode/PclmulCrc.cpp \ + lib/yencode/NeonDecoder.cpp \ + lib/yencode/ArmCrc.cpp + +lib/yencode/Sse2Decoder.$(OBJEXT) : CXXFLAGS+=$(SSE2_CXXFLAGS) +lib/yencode/Ssse3Decoder.$(OBJEXT) : CXXFLAGS+=$(SSSE3_CXXFLAGS) +lib/yencode/PclmulCrc.$(OBJEXT) : CXXFLAGS+=$(PCLMUL_CXXFLAGS) +lib/yencode/NeonDecoder.$(OBJEXT) : CXXFLAGS+=$(NEON_CXXFLAGS) +lib/yencode/ArmCrc.$(OBJEXT) : CXXFLAGS+=$(ARMCRC_CXXFLAGS) + AM_CPPFLAGS = \ -I$(srcdir)/daemon/connect \ -I$(srcdir)/daemon/extension \ @@ -226,7 +243,8 @@ AM_CPPFLAGS = \ -I$(srcdir)/daemon/remote \ -I$(srcdir)/daemon/util \ -I$(srcdir)/daemon/nserv \ - -I$(srcdir)/lib/par2 + -I$(srcdir)/lib/par2 \ + -I$(srcdir)/lib/yencode if WITH_TESTS nzbget_SOURCES += \ diff --git a/Makefile.in b/Makefile.in index bb8743cc..27836422 100644 --- a/Makefile.in +++ b/Makefile.in @@ -312,6 +312,10 @@ am__nzbget_SOURCES_DIST = daemon/connect/Connection.cpp \ lib/par2/reedsolomon.h lib/par2/verificationhashtable.cpp \ lib/par2/verificationhashtable.h \ lib/par2/verificationpacket.cpp lib/par2/verificationpacket.h \ + lib/yencode/YEncode.h lib/yencode/SimdInit.cpp \ + lib/yencode/ScalarDecoder.cpp lib/yencode/Sse2Decoder.cpp \ + lib/yencode/Ssse3Decoder.cpp lib/yencode/PclmulCrc.cpp \ + lib/yencode/NeonDecoder.cpp lib/yencode/ArmCrc.cpp \ lib/catch/catch.h tests/suite/TestMain.cpp \ tests/suite/TestMain.h tests/suite/TestUtil.cpp \ tests/suite/TestUtil.h tests/main/CommandLineParserTest.cpp \ @@ -425,7 +429,13 @@ am_nzbget_OBJECTS = daemon/connect/Connection.$(OBJEXT) \ daemon/nserv/NntpServer.$(OBJEXT) \ daemon/nserv/NzbGenerator.$(OBJEXT) \ daemon/nserv/YEncoder.$(OBJEXT) code_revision.$(OBJEXT) \ - $(am__objects_1) $(am__objects_2) $(am__objects_3) + $(am__objects_1) lib/yencode/SimdInit.$(OBJEXT) \ + lib/yencode/ScalarDecoder.$(OBJEXT) \ + lib/yencode/Sse2Decoder.$(OBJEXT) \ + lib/yencode/Ssse3Decoder.$(OBJEXT) \ + lib/yencode/PclmulCrc.$(OBJEXT) \ + lib/yencode/NeonDecoder.$(OBJEXT) 
lib/yencode/ArmCrc.$(OBJEXT) \ + $(am__objects_2) $(am__objects_3) nzbget_OBJECTS = $(am_nzbget_OBJECTS) nzbget_LDADD = $(LDADD) am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; @@ -553,6 +563,7 @@ am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \ ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +ARMCRC_CXXFLAGS = @ARMCRC_CXXFLAGS@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ @@ -585,6 +596,7 @@ MAINT = @MAINT@ MAKE = @MAKE@ MAKEINFO = @MAKEINFO@ MKDIR_P = @MKDIR_P@ +NEON_CXXFLAGS = @NEON_CXXFLAGS@ OBJEXT = @OBJEXT@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ @@ -594,11 +606,14 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ +PCLMUL_CXXFLAGS = @PCLMUL_CXXFLAGS@ PKG_CONFIG = @PKG_CONFIG@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ +SSE2_CXXFLAGS = @SSE2_CXXFLAGS@ +SSSE3_CXXFLAGS = @SSSE3_CXXFLAGS@ STRIP = @STRIP@ TAR = @TAR@ VERSION = @VERSION@ @@ -668,6 +683,8 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ zlib_CFLAGS = @zlib_CFLAGS@ zlib_LIBS = @zlib_LIBS@ + +# Simd decoder and Crc32 nzbget_SOURCES = daemon/connect/Connection.cpp \ daemon/connect/Connection.h daemon/connect/TlsSocket.cpp \ daemon/connect/TlsSocket.h daemon/connect/WebDownloader.cpp \ @@ -760,14 +777,18 @@ nzbget_SOURCES = daemon/connect/Connection.cpp \ daemon/nserv/NntpServer.h daemon/nserv/NntpServer.cpp \ daemon/nserv/NzbGenerator.h daemon/nserv/NzbGenerator.cpp \ daemon/nserv/YEncoder.h daemon/nserv/YEncoder.cpp \ - code_revision.cpp $(am__append_1) $(am__append_2) \ - $(am__append_3) + code_revision.cpp $(am__append_1) lib/yencode/YEncode.h \ + lib/yencode/SimdInit.cpp lib/yencode/ScalarDecoder.cpp \ + lib/yencode/Sse2Decoder.cpp lib/yencode/Ssse3Decoder.cpp \ + lib/yencode/PclmulCrc.cpp 
lib/yencode/NeonDecoder.cpp \ + lib/yencode/ArmCrc.cpp $(am__append_2) $(am__append_3) AM_CPPFLAGS = -I$(srcdir)/daemon/connect -I$(srcdir)/daemon/extension \ -I$(srcdir)/daemon/feed -I$(srcdir)/daemon/frontend \ -I$(srcdir)/daemon/main -I$(srcdir)/daemon/nntp \ -I$(srcdir)/daemon/postprocess -I$(srcdir)/daemon/queue \ -I$(srcdir)/daemon/remote -I$(srcdir)/daemon/util \ - -I$(srcdir)/daemon/nserv -I$(srcdir)/lib/par2 $(am__append_4) + -I$(srcdir)/daemon/nserv -I$(srcdir)/lib/par2 \ + -I$(srcdir)/lib/yencode $(am__append_4) EXTRA_DIST = \ $(windows_FILES) \ $(osx_FILES) \ @@ -1313,6 +1334,26 @@ lib/par2/verificationhashtable.$(OBJEXT): lib/par2/$(am__dirstamp) \ lib/par2/$(DEPDIR)/$(am__dirstamp) lib/par2/verificationpacket.$(OBJEXT): lib/par2/$(am__dirstamp) \ lib/par2/$(DEPDIR)/$(am__dirstamp) +lib/yencode/$(am__dirstamp): + @$(MKDIR_P) lib/yencode + @: > lib/yencode/$(am__dirstamp) +lib/yencode/$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) lib/yencode/$(DEPDIR) + @: > lib/yencode/$(DEPDIR)/$(am__dirstamp) +lib/yencode/SimdInit.$(OBJEXT): lib/yencode/$(am__dirstamp) \ + lib/yencode/$(DEPDIR)/$(am__dirstamp) +lib/yencode/ScalarDecoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \ + lib/yencode/$(DEPDIR)/$(am__dirstamp) +lib/yencode/Sse2Decoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \ + lib/yencode/$(DEPDIR)/$(am__dirstamp) +lib/yencode/Ssse3Decoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \ + lib/yencode/$(DEPDIR)/$(am__dirstamp) +lib/yencode/PclmulCrc.$(OBJEXT): lib/yencode/$(am__dirstamp) \ + lib/yencode/$(DEPDIR)/$(am__dirstamp) +lib/yencode/NeonDecoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \ + lib/yencode/$(DEPDIR)/$(am__dirstamp) +lib/yencode/ArmCrc.$(OBJEXT): lib/yencode/$(am__dirstamp) \ + lib/yencode/$(DEPDIR)/$(am__dirstamp) tests/suite/$(am__dirstamp): @$(MKDIR_P) tests/suite @: > tests/suite/$(am__dirstamp) @@ -1455,6 +1496,7 @@ mostlyclean-compile: -rm -f daemon/remote/*.$(OBJEXT) -rm -f daemon/util/*.$(OBJEXT) -rm -f lib/par2/*.$(OBJEXT) + -rm -f 
lib/yencode/*.$(OBJEXT) -rm -f tests/feed/*.$(OBJEXT) -rm -f tests/main/*.$(OBJEXT) -rm -f tests/nntp/*.$(OBJEXT) @@ -1559,6 +1601,13 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@lib/par2/$(DEPDIR)/reedsolomon.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@lib/par2/$(DEPDIR)/verificationhashtable.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@lib/par2/$(DEPDIR)/verificationpacket.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/ArmCrc.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/NeonDecoder.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/PclmulCrc.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/ScalarDecoder.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/SimdInit.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/Sse2Decoder.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/Ssse3Decoder.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@tests/feed/$(DEPDIR)/FeedFilterTest.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@tests/main/$(DEPDIR)/CommandLineParserTest.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@tests/main/$(DEPDIR)/OptionsTest.Po@am__quote@ @@ -1940,6 +1989,8 @@ distclean-generic: -rm -f daemon/util/$(am__dirstamp) -rm -f lib/par2/$(DEPDIR)/$(am__dirstamp) -rm -f lib/par2/$(am__dirstamp) + -rm -f lib/yencode/$(DEPDIR)/$(am__dirstamp) + -rm -f lib/yencode/$(am__dirstamp) -rm -f tests/feed/$(DEPDIR)/$(am__dirstamp) -rm -f tests/feed/$(am__dirstamp) -rm -f tests/main/$(DEPDIR)/$(am__dirstamp) @@ -1964,7 +2015,7 @@ clean-am: clean-binPROGRAMS clean-generic mostlyclean-am distclean: distclean-am -rm -f $(am__CONFIG_DISTCLEAN_FILES) - -rm -rf ./$(DEPDIR) daemon/connect/$(DEPDIR) daemon/extension/$(DEPDIR) daemon/feed/$(DEPDIR) daemon/frontend/$(DEPDIR) daemon/main/$(DEPDIR) daemon/nntp/$(DEPDIR) daemon/nserv/$(DEPDIR) daemon/postprocess/$(DEPDIR) 
daemon/queue/$(DEPDIR) daemon/remote/$(DEPDIR) daemon/util/$(DEPDIR) lib/par2/$(DEPDIR) tests/feed/$(DEPDIR) tests/main/$(DEPDIR) tests/nntp/$(DEPDIR) tests/postprocess/$(DEPDIR) tests/queue/$(DEPDIR) tests/suite/$(DEPDIR) tests/util/$(DEPDIR) + -rm -rf ./$(DEPDIR) daemon/connect/$(DEPDIR) daemon/extension/$(DEPDIR) daemon/feed/$(DEPDIR) daemon/frontend/$(DEPDIR) daemon/main/$(DEPDIR) daemon/nntp/$(DEPDIR) daemon/nserv/$(DEPDIR) daemon/postprocess/$(DEPDIR) daemon/queue/$(DEPDIR) daemon/remote/$(DEPDIR) daemon/util/$(DEPDIR) lib/par2/$(DEPDIR) lib/yencode/$(DEPDIR) tests/feed/$(DEPDIR) tests/main/$(DEPDIR) tests/nntp/$(DEPDIR) tests/postprocess/$(DEPDIR) tests/queue/$(DEPDIR) tests/suite/$(DEPDIR) tests/util/$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-hdr distclean-tags @@ -2015,7 +2066,7 @@ installcheck-am: maintainer-clean: maintainer-clean-am -rm -f $(am__CONFIG_DISTCLEAN_FILES) -rm -rf $(top_srcdir)/autom4te.cache - -rm -rf ./$(DEPDIR) daemon/connect/$(DEPDIR) daemon/extension/$(DEPDIR) daemon/feed/$(DEPDIR) daemon/frontend/$(DEPDIR) daemon/main/$(DEPDIR) daemon/nntp/$(DEPDIR) daemon/nserv/$(DEPDIR) daemon/postprocess/$(DEPDIR) daemon/queue/$(DEPDIR) daemon/remote/$(DEPDIR) daemon/util/$(DEPDIR) lib/par2/$(DEPDIR) tests/feed/$(DEPDIR) tests/main/$(DEPDIR) tests/nntp/$(DEPDIR) tests/postprocess/$(DEPDIR) tests/queue/$(DEPDIR) tests/suite/$(DEPDIR) tests/util/$(DEPDIR) + -rm -rf ./$(DEPDIR) daemon/connect/$(DEPDIR) daemon/extension/$(DEPDIR) daemon/feed/$(DEPDIR) daemon/frontend/$(DEPDIR) daemon/main/$(DEPDIR) daemon/nntp/$(DEPDIR) daemon/nserv/$(DEPDIR) daemon/postprocess/$(DEPDIR) daemon/queue/$(DEPDIR) daemon/remote/$(DEPDIR) daemon/util/$(DEPDIR) lib/par2/$(DEPDIR) lib/yencode/$(DEPDIR) tests/feed/$(DEPDIR) tests/main/$(DEPDIR) tests/nntp/$(DEPDIR) tests/postprocess/$(DEPDIR) tests/queue/$(DEPDIR) tests/suite/$(DEPDIR) tests/util/$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am 
maintainer-clean-generic @@ -2064,6 +2115,12 @@ uninstall-am: uninstall-binPROGRAMS uninstall-dist_docDATA \ .PRECIOUS: Makefile +lib/yencode/Sse2Decoder.$(OBJEXT) : CXXFLAGS+=$(SSE2_CXXFLAGS) +lib/yencode/Ssse3Decoder.$(OBJEXT) : CXXFLAGS+=$(SSSE3_CXXFLAGS) +lib/yencode/PclmulCrc.$(OBJEXT) : CXXFLAGS+=$(PCLMUL_CXXFLAGS) +lib/yencode/NeonDecoder.$(OBJEXT) : CXXFLAGS+=$(NEON_CXXFLAGS) +lib/yencode/ArmCrc.$(OBJEXT) : CXXFLAGS+=$(ARMCRC_CXXFLAGS) + # Note about "sed": # We need to make some changes in installed files. # On Linux "sed" has option "-i" for in-place-edit. Unfortunateley the BSD version of "sed" diff --git a/configure b/configure index 1fec8ccd..bf7bc702 100755 --- a/configure +++ b/configure @@ -628,6 +628,11 @@ LTLIBOBJS LIBOBJS WITH_TESTS_FALSE WITH_TESTS_TRUE +ARMCRC_CXXFLAGS +NEON_CXXFLAGS +PCLMUL_CXXFLAGS +SSSE3_CXXFLAGS +SSE2_CXXFLAGS zlib_LIBS zlib_CFLAGS nettle_LIBS @@ -5756,48 +5761,13 @@ fi done -for ac_header in sys/prctl.h +for ac_header in sys/prctl.h regex.h endian.h getopt.h do : - ac_fn_cxx_check_header_mongrel "$LINENO" "sys/prctl.h" "ac_cv_header_sys_prctl_h" "$ac_includes_default" -if test "x$ac_cv_header_sys_prctl_h" = xyes; then : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_cxx_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : cat >>confdefs.h <<_ACEOF -#define HAVE_SYS_PRCTL_H 1 -_ACEOF - -fi - -done - -for ac_header in regex.h -do : - ac_fn_cxx_check_header_mongrel "$LINENO" "regex.h" "ac_cv_header_regex_h" "$ac_includes_default" -if test "x$ac_cv_header_regex_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_REGEX_H 1 -_ACEOF - -fi - -done - -for ac_header in endian.h -do : - ac_fn_cxx_check_header_mongrel "$LINENO" "endian.h" "ac_cv_header_endian_h" "$ac_includes_default" -if test "x$ac_cv_header_endian_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_ENDIAN_H 1 -_ACEOF - -fi - -done - -for 
ac_header in getopt.h -do : - ac_fn_cxx_check_header_mongrel "$LINENO" "getopt.h" "ac_cv_header_getopt_h" "$ac_includes_default" -if test "x$ac_cv_header_getopt_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_GETOPT_H 1 +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 _ACEOF fi @@ -6569,12 +6539,12 @@ main () _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: result: size_t" >&5 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: size_t" >&5 $as_echo "size_t" >&6; } - SOCKLEN_T=size_t + SOCKLEN_T=size_t else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #include @@ -6591,14 +6561,14 @@ main () _ACEOF if ac_fn_cxx_try_compile "$LINENO"; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: result: int" >&5 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: int" >&5 $as_echo "int" >&6; } - SOCKLEN_T=int + SOCKLEN_T=int else - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: could not determine" >&5 + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: could not determine" >&5 $as_echo "$as_me: WARNING: could not determine" >&2;} - SOCKLEN_T=int + SOCKLEN_T=int fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext fi @@ -8366,6 +8336,35 @@ $as_echo "#define DISABLE_GZIP 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use SIMD-optimized routines" >&5 +$as_echo_n "checking whether to use SIMD-optimized routines... 
" >&6; } +USE_SIMD=no +case $host_cpu in + i?86|x86_64) + SSE2_CXXFLAGS="-msse2" + SSSE3_CXXFLAGS="-mssse3" + PCLMUL_CXXFLAGS="-msse4.1 -mpclmul" + USE_SIMD=yes + ;; + arm*) + NEON_CXXFLAGS="-mfpu=neon" + ARMCRC_CXXFLAGS="-march=armv8-a+crc" + USE_SIMD=yes + ;; + aarch64) + ARMCRC_CXXFLAGS="-march=armv8-a+crc" + USE_SIMD=yes + ;; +esac +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_SIMD" >&5 +$as_echo "$USE_SIMD" >&6; } + + + + + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use an empty SIGCHLD handler" >&5 $as_echo_n "checking whether to use an empty SIGCHLD handler... " >&6; } # Check whether --enable-sigchld-handler was given. diff --git a/configure.ac b/configure.ac index 60c05819..31567676 100644 --- a/configure.ac +++ b/configure.ac @@ -65,10 +65,7 @@ fi dnl dnl Checks for header files. dnl -AC_CHECK_HEADERS(sys/prctl.h) -AC_CHECK_HEADERS(regex.h) -AC_CHECK_HEADERS(endian.h) -AC_CHECK_HEADERS(getopt.h) +AC_CHECK_HEADERS(sys/prctl.h regex.h endian.h getopt.h) dnl @@ -148,7 +145,7 @@ if test "$FOUND" = "no"; then [ char* szHost; struct hostent hinfobuf; char* strbuf; int h_errnop; struct hostent* hinfo = gethostbyname_r(szHost, &hinfobuf, strbuf, 1024, &h_errnop); ], AC_MSG_RESULT([[yes, and it takes 5 arguments]]) - FOUND="yes" + FOUND="yes" AC_DEFINE([HAVE_GETHOSTBYNAME_R_5], 1, [Define to 1 if gethostbyname_r takes 5 arguments]), FOUND="no") @@ -198,17 +195,17 @@ AC_TRY_COMPILE([ #include #include ],[ (void)getsockopt (1, 1, 1, NULL, (size_t*)NULL)],[ - AC_MSG_RESULT(size_t) - SOCKLEN_T=size_t],[ - AC_TRY_COMPILE([ + AC_MSG_RESULT(size_t) + SOCKLEN_T=size_t],[ + AC_TRY_COMPILE([ #include #include #include ],[ (void)getsockopt (1, 1, 1, NULL, (int*)NULL)],[ - AC_MSG_RESULT(int) - SOCKLEN_T=int],[ - AC_MSG_WARN(could not determine) - SOCKLEN_T=int])])]) + AC_MSG_RESULT(int) + SOCKLEN_T=int],[ + AC_MSG_WARN(could not determine) + SOCKLEN_T=int])])]) AC_DEFINE_UNQUOTED(SOCKLEN_T, $SOCKLEN_T, [Determine what socket length (socklen_t) 
data type is]) @@ -540,6 +537,36 @@ else fi +dnl +dnl Select per-file SIMD compiler flags by host CPU family (arm* also matches armv6l and armv7l) +dnl +AC_MSG_CHECKING(whether to use SIMD-optimized routines) +USE_SIMD=no +case $host_cpu in + i?86|x86_64) + SSE2_CXXFLAGS="-msse2" + SSSE3_CXXFLAGS="-mssse3" + PCLMUL_CXXFLAGS="-msse4.1 -mpclmul" + USE_SIMD=yes + ;; + arm*) + NEON_CXXFLAGS="-mfpu=neon" + ARMCRC_CXXFLAGS="-march=armv8-a+crc" + USE_SIMD=yes + ;; + aarch64) + ARMCRC_CXXFLAGS="-march=armv8-a+crc" + USE_SIMD=yes + ;; +esac +AC_MSG_RESULT($USE_SIMD) +AC_SUBST([SSE2_CXXFLAGS]) +AC_SUBST([SSSE3_CXXFLAGS]) +AC_SUBST([PCLMUL_CXXFLAGS]) +AC_SUBST([NEON_CXXFLAGS]) +AC_SUBST([ARMCRC_CXXFLAGS]) + + dnl dnl Some Linux systems require an empty signal handler for SIGCHLD dnl in order for exit codes to be correctly delivered to parent process. diff --git a/daemon/main/nzbget.cpp b/daemon/main/nzbget.cpp index e6f5e000..b71b939b 100644 --- a/daemon/main/nzbget.cpp +++ b/daemon/main/nzbget.cpp @@ -262,6 +262,7 @@ void NZBGet::Init() #ifndef DISABLE_TLS TlsSocket::Init(); #endif + Decoder::Init(); } CreateGlobals(); diff --git a/daemon/main/nzbget.h b/daemon/main/nzbget.h index 43eba745..b93bbb80 100644 --- a/daemon/main/nzbget.h +++ b/daemon/main/nzbget.h @@ -59,9 +59,6 @@ compiled */ /* Define to 1 if variadic macros are supported */ #define HAVE_VARIADIC_MACROS -/* Define to 1 if libpar2 supports cancelling (needs a special patch) */ -#define HAVE_PAR2_CANCEL - /* Define to 1 if function GetAddrInfo is supported */ #define HAVE_GETADDRINFO @@ -95,6 +92,12 @@ compiled */ #define _WIN32_WINNT 0x0501 #endif +#ifdef _WIN64 +#define __amd64__ +#else +#define __i686__ +#endif + #ifdef _DEBUG // detection of memory leaks #define _CRTDBG_MAP_ALLOC @@ -312,6 +315,10 @@ typedef int pid_t; #define FOPEN_WB "wbN" #define FOPEN_AB "abN" +#define __SSE2__ +#define __SSSE3__ +#define __PCLMUL__ + #ifdef DEBUG // redefine "exit" to avoid printing memory leaks report when terminated because of wrong command line switches #define 
exit(code) ExitProcess(code) diff --git a/daemon/nntp/Decoder.cpp b/daemon/nntp/Decoder.cpp index bff055f7..ced8aaf1 100644 --- a/daemon/nntp/Decoder.cpp +++ b/daemon/nntp/Decoder.cpp @@ -22,9 +22,23 @@ #include "Decoder.h" #include "Log.h" #include "Util.h" +#include "YEncode.h" const char* Decoder::FormatNames[] = { "Unknown", "yEnc", "UU" }; +void Decoder::Init() +{ + YEncode::init(); + + debug("%s", YEncode::decode_simd ? "SIMD yEnc decoder can be used" : "SIMD yEnc decoder isn't available for this CPU"); + debug("%s", YEncode::crc32_simd ? "SIMD Crc32 routine can be used" : "SIMD Crc32 routine isn't available for this CPU"); + debug("%s", YEncode::inc_crc32_simd ? "SIMD Crc32 (incremental) routine can be used" : "SIMD Crc32 (incremental) routine isn't available for this CPU"); + + printf("%s\n", YEncode::decode_simd ? "SIMD yEnc decoder can be used" : "SIMD yEnc decoder isn't available for this CPU"); + printf("%s\n", YEncode::crc32_simd ? "SIMD Crc32 routine can be used" : "SIMD Crc32 routine isn't available for this CPU"); + printf("%s\n", YEncode::inc_crc32_simd ? 
"SIMD Crc32 (incremental) routine can be used" : "SIMD Crc32 (incremental) routine isn't available for this CPU"); +} + void Decoder::Clear() { m_articleFilename.Clear(); diff --git a/daemon/nntp/Decoder.h b/daemon/nntp/Decoder.h index a6891aea..16894bfa 100644 --- a/daemon/nntp/Decoder.h +++ b/daemon/nntp/Decoder.h @@ -45,6 +45,7 @@ public: static const char* FormatNames[]; + static void Init(); virtual ~Decoder() {} virtual EStatus Check() = 0; virtual void Clear(); diff --git a/lib/yencode/ArmCrc.cpp b/lib/yencode/ArmCrc.cpp new file mode 100644 index 00000000..85a8a973 --- /dev/null +++ b/lib/yencode/ArmCrc.cpp @@ -0,0 +1,94 @@ +/* + * Based on node-yencode library by Anime Tosho: + * https://github.com/animetosho/node-yencode + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +// inspired off https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c + +#include "nzbget.h" + +#ifdef __ARM_FEATURE_CRC32 +#include +#endif + +namespace YEncode +{ +#ifdef __ARM_FEATURE_CRC32 + +inline uint32_t crc_arm(uint32_t crc, const unsigned char *src, long len) { + // initial alignment + if (len >= 16) { // 16 is an arbitrary number; it just needs to be >=8 + if ((uintptr_t)src & sizeof(uint8_t)) { + crc = __crc32b(crc, *src); + src++; + len--; + } + if ((uintptr_t)src & sizeof(uint16_t)) { + crc = __crc32h(crc, *((uint16_t *)src)); + src += sizeof(uint16_t); + len -= sizeof(uint16_t); + } + +#ifdef __aarch64__ + if ((uintptr_t)src & sizeof(uint32_t)) { + crc = __crc32w(crc, *((uint32_t *)src)); + src += sizeof(uint32_t); + len -= sizeof(uint32_t); + } + } + while ((len -= sizeof(uint64_t)) >= 0) { + crc = __crc32d(crc, *((uint64_t *)src)); + src += sizeof(uint64_t); + } + if (len & sizeof(uint32_t)) { + crc = __crc32w(crc, *((uint32_t *)src)); + src += sizeof(uint32_t); + } +#else + } + while ((len -= sizeof(uint32_t)) >= 0) { + crc = __crc32w(crc, *((uint32_t *)src)); + src += sizeof(uint32_t); + } +#endif + if (len & sizeof(uint16_t)) { + crc = __crc32h(crc, *((uint16_t *)src)); + src += sizeof(uint16_t); + } + if (len & sizeof(uint8_t)) + crc = __crc32b(crc, *src); + + return crc; +} + +uint32_t do_crc32_arm(const unsigned char *src, long len) +{ + return ~crc_arm(~0, src, len); +} + +extern uint32_t (*crc32_arm)(const unsigned char *src, long len); +extern uint32_t (*inc_crc32_simd)(uint32_t crc, const unsigned char* src, long len); +#endif + +void init_crc32_arm() +{ +#ifdef __ARM_FEATURE_CRC32 + crc32_arm = &do_crc32_arm; + inc_crc32_simd = &crc_arm; +#endif +} + +} diff --git a/lib/yencode/NeonDecoder.cpp b/lib/yencode/NeonDecoder.cpp new file mode 100644 index 00000000..14e008f5 --- /dev/null +++ b/lib/yencode/NeonDecoder.cpp @@ -0,0 +1,272 @@ +/* + * Based on node-yencode library by Anime Tosho: + * 
https://github.com/animetosho/node-yencode + * + * Copyright (C) 2017 Anime Tosho (animetosho) + * Copyright (C) 2017 Andrey Prygunkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +#include "nzbget.h" + +#include "YEncode.h" + +#ifdef __ARM_NEON +#include +#endif + +namespace YEncode +{ +#ifdef __ARM_NEON + +// combine two 8-bit ints into a 16-bit one +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define UINT16_PACK(a, b) ((a) | ((b) << 8)) +#else +#define UINT16_PACK(a, b) (((a) << 8) | (b)) +#endif + +// table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable +static const unsigned char BitsSetTable256[256] = +{ +# define B2(n) n, n+1, n+1, n+2 +# define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2) +# define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2) + B6(0), B6(1), B6(1), B6(2) +#undef B2 +#undef B4 +#undef B6 +}; + +static uint16_t neon_movemask(uint8x16_t in) { + uint8x16_t mask = vandq_u8(in, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}); +# if defined(__aarch64__) && 0 + // TODO: is this better? 
+ return (vaddv_u8(vget_high_u8(mask)) << 8) | vaddv_u8(vget_low_u8(mask)); +# else + uint8x8_t res = vpadd_u8(vget_low_u8(mask), vget_high_u8(mask)); + res = vpadd_u8(res, res); + res = vpadd_u8(res, res); + return vget_lane_u16(vreinterpret_u16_u8(res), 0); +# endif +} + +uint8_t eqFixLUT[256]; +alignas(32) uint8x8_t eqAddLUT[256]; +alignas(32) uint8x8_t unshufLUT[256]; +alignas(32) static const uint8_t pshufb_combine_table[272] = { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80, + 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80, + 0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80, + 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80, + 0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80, + 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80, + 0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80, + 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, +}; + +size_t do_decode_neon(const unsigned char* src, unsigned char* dest, size_t len, char* state) { + if(len <= sizeof(uint8x16_t)*2) return decode_scalar(src, dest, len, state); + + unsigned char *p = dest; // destination pointer + unsigned long i = 0; // input position + unsigned char escFirst = 0; // input character; first char needs escaping + unsigned int nextMask = 0; + char tState = 0; + char* pState = state ? state : &tState; + if((uintptr_t)src & ((sizeof(uint8x16_t)-1))) { + // find source memory alignment + unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(uint8x16_t)-1)) & ~(sizeof(uint8x16_t)-1)); + + i = aSrc - src; + p += decode_scalar(src, dest, i, pState); + } + + // handle finicky case of \r\n. 
straddled across initial boundary + if(*pState == 0 && i+1 < len && src[i] == '.') + nextMask = 1; + else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.')) + nextMask = 2; + + escFirst = *pState == 1; + + if(i + (sizeof(uint8x16_t)+1) < len) { + // our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking) + size_t dLen = len - (sizeof(uint8x16_t)+1); + dLen = ((dLen-i) + 0xf) & ~0xf; + uint8_t* dSrc = (uint8_t*)src + dLen + i; + long dI = -dLen; + i += dLen; + + for(; dI; dI += sizeof(uint8x16_t)) { + uint8x16_t data = vld1q_u8(dSrc + dI); + + // search for special chars + uint8x16_t cmpEq = vceqq_u8(data, vdupq_n_u8('=')), + cmp = vorrq_u8( + vorrq_u8( + vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0a0d))), // \r\n + vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0d0a))) // \n\r + ), + cmpEq + ); + uint16_t mask = neon_movemask(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later + + uint8x16_t oData; + if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers? 
+ // first byte needs escaping due to preceeding = in last loop iteration + oData = vsubq_u8(data, (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42}); + } else { + oData = vsubq_u8(data, vdupq_n_u8(42)); + } + mask &= ~escFirst; + mask |= nextMask; + + if (mask != 0) { + // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant + // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that + + // firstly, resolve invalid sequences of = to deal with cases like '====' + uint16_t maskEq = neon_movemask(cmpEq); + uint16_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst]; + maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp; + + escFirst = (maskEq >> (sizeof(uint8x16_t)-1)); + // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed + maskEq <<= 1; + mask &= ~maskEq; + + // unescape chars following `=` + oData = vaddq_u8( + oData, + vcombine_u8( + vld1_u8((uint8_t*)(eqAddLUT + (maskEq&0xff))), + vld1_u8((uint8_t*)(eqAddLUT + ((maskEq>>8)&0xff))) + ) + ); + + // handle \r\n. sequences + // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing + // find instances of \r\n + uint8x16_t tmpData1, tmpData2; + uint8x16_t nextData = vld1q_u8(dSrc + dI + sizeof(uint8x16_t)); + tmpData1 = vextq_u8(data, nextData, 1); + tmpData2 = vextq_u8(data, nextData, 2); + uint8x16_t cmp1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(data), vdupq_n_u16(0x0a0d))); + uint8x16_t cmp2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData1), vdupq_n_u16(0x0a0d))); + // prepare to merge the two comparisons + cmp1 = vextq_u8(cmp1, vdupq_n_u8(0), 1); + // find all instances of . + tmpData2 = vceqq_u8(tmpData2, vdupq_n_u8('.')); + // merge matches of \r\n with those for . 
// Registers the NEON decoder and fills the three 256-entry lookup tables it
// indexes with an 8-bit mask (one table row per possible mask value).
// Compiles to an empty function when __ARM_NEON is not defined.
void init_decode_neon() {
#ifdef __ARM_NEON
	decode_neon = &do_decode_neon;

	for(int idx = 0; idx < 256; idx++) {
		uint8_t lanes[8];

		// eqFixLUT: walk the bits from low to high; every set bit that is kept
		// causes the bit directly above it to be dropped (a '=' that follows
		// a '=' is data, not another escape marker)
		int fixed = 0;
		int bit = 0;
		while(bit < 8) {
			if((idx >> bit) & 1) {
				fixed |= 1 << bit;
				bit += 2;
			} else {
				bit++;
			}
		}
		eqFixLUT[idx] = fixed;

		// eqAddLUT: byte lane n holds 192 (== -64 modulo 256) when bit n is set, else 0
		for(int lane = 0; lane < 8; lane++)
			lanes[lane] = ((idx >> lane) & 1) ? 192 /* == -64 */ : 0;
		vst1_u8((uint8_t*)(eqAddLUT + idx), vld1_u8(lanes));

		// unshufLUT: positions of the cleared bits in ascending order,
		// remaining lanes padded with 0
		int kept = 0;
		for(int lane = 0; lane < 8; lane++) {
			if(((idx >> lane) & 1) == 0)
				lanes[kept++] = lane;
		}
		while(kept < 8)
			lanes[kept++] = 0;
		vst1_u8((uint8_t*)(unshufLUT + idx), vld1_u8(lanes));
	}
#endif
}
+ * Authors: + * Wajdi Feghali + * Jim Guilford + * Vinodh Gopal + * Erdinc Ozturk + * Jim Kukunas + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "nzbget.h" + +#ifdef __PCLMUL__ +#include +#endif + +namespace YEncode +{ +#ifdef __PCLMUL__ + +void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { + const __m128i xmm_fold4 = _mm_set_epi32( + 0x00000001, 0x54442bd4, + 0x00000001, 0xc6e41596); + + __m128i x_tmp3; + __m128 ps_crc0, ps_crc3, ps_res; + + x_tmp3 = *xmm_crc3; + + *xmm_crc3 = *xmm_crc0; + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); + ps_crc0 = _mm_castsi128_ps(*xmm_crc0); + ps_crc3 = _mm_castsi128_ps(*xmm_crc3); + ps_res = _mm_xor_ps(ps_crc0, ps_crc3); + + *xmm_crc0 = *xmm_crc1; + *xmm_crc1 = *xmm_crc2; + *xmm_crc2 = x_tmp3; + *xmm_crc3 = _mm_castps_si128(ps_res); +} + +void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { + const __m128i xmm_fold4 = _mm_set_epi32( + 0x00000001, 0x54442bd4, + 0x00000001, 0xc6e41596); + + __m128i x_tmp3, x_tmp2; + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20; + + x_tmp3 = *xmm_crc3; + x_tmp2 = *xmm_crc2; + + *xmm_crc3 = *xmm_crc1; + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); + ps_crc3 = _mm_castsi128_ps(*xmm_crc3); + ps_crc1 = _mm_castsi128_ps(*xmm_crc1); + ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1); + + *xmm_crc2 = *xmm_crc0; + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10); + ps_crc0 = _mm_castsi128_ps(*xmm_crc0); + ps_crc2 = _mm_castsi128_ps(*xmm_crc2); + ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2); + + *xmm_crc0 = x_tmp2; + *xmm_crc1 = x_tmp3; + *xmm_crc2 = _mm_castps_si128(ps_res20); + *xmm_crc3 = _mm_castps_si128(ps_res31); +} + +void fold_3(__m128i 
*xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { + const __m128i xmm_fold4 = _mm_set_epi32( + 0x00000001, 0x54442bd4, + 0x00000001, 0xc6e41596); + + __m128i x_tmp3; + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10; + + x_tmp3 = *xmm_crc3; + + *xmm_crc3 = *xmm_crc2; + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01); + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); + ps_crc2 = _mm_castsi128_ps(*xmm_crc2); + ps_crc3 = _mm_castsi128_ps(*xmm_crc3); + ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3); + + *xmm_crc2 = *xmm_crc1; + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10); + ps_crc1 = _mm_castsi128_ps(*xmm_crc1); + ps_crc2 = _mm_castsi128_ps(*xmm_crc2); + ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2); + + *xmm_crc1 = *xmm_crc0; + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10); + ps_crc0 = _mm_castsi128_ps(*xmm_crc0); + ps_crc1 = _mm_castsi128_ps(*xmm_crc1); + ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1); + + *xmm_crc0 = x_tmp3; + *xmm_crc1 = _mm_castps_si128(ps_res10); + *xmm_crc2 = _mm_castps_si128(ps_res21); + *xmm_crc3 = _mm_castps_si128(ps_res32); +} + +void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { + const __m128i xmm_fold4 = _mm_set_epi32( + 0x00000001, 0x54442bd4, + 0x00000001, 0xc6e41596); + + __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3; + __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3; + __m128 ps_t0, ps_t1, ps_t2, ps_t3; + __m128 ps_res0, ps_res1, ps_res2, ps_res3; + + x_tmp0 = *xmm_crc0; + x_tmp1 = *xmm_crc1; + x_tmp2 = *xmm_crc2; + x_tmp3 = *xmm_crc3; + + *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); + x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10); + ps_crc0 = _mm_castsi128_ps(*xmm_crc0); + ps_t0 = _mm_castsi128_ps(x_tmp0); + ps_res0 = _mm_xor_ps(ps_crc0, ps_t0); + + *xmm_crc1 = 
_mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01); + x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10); + ps_crc1 = _mm_castsi128_ps(*xmm_crc1); + ps_t1 = _mm_castsi128_ps(x_tmp1); + ps_res1 = _mm_xor_ps(ps_crc1, ps_t1); + + *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01); + x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10); + ps_crc2 = _mm_castsi128_ps(*xmm_crc2); + ps_t2 = _mm_castsi128_ps(x_tmp2); + ps_res2 = _mm_xor_ps(ps_crc2, ps_t2); + + *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01); + x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10); + ps_crc3 = _mm_castsi128_ps(*xmm_crc3); + ps_t3 = _mm_castsi128_ps(x_tmp3); + ps_res3 = _mm_xor_ps(ps_crc3, ps_t3); + + *xmm_crc0 = _mm_castps_si128(ps_res0); + *xmm_crc1 = _mm_castps_si128(ps_res1); + *xmm_crc2 = _mm_castps_si128(ps_res2); + *xmm_crc3 = _mm_castps_si128(ps_res3); +} + +alignas(32) const unsigned pshufb_shf_table[60] = { + 0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */ + 0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */ + 0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */ + 0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */ + 0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */ + 0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */ + 0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */ + 0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */ + 0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */ + 0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/ + 0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/ + 0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/ + 0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/ + 0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 
// Merges a partial block of `len` bytes (held in *xmm_crc_part) into the four
// running accumulators.
//
// len           selects row (len - 1) of pshufb_shf_table; the table has 15
//               rows, so len is expected to be in 1..15.
// xmm_crc0..3   running accumulators, updated in place.
// xmm_crc_part  the partial input block; it is overwritten with its shuffled form.
void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
		__m128i *xmm_crc2, __m128i *xmm_crc3, __m128i *xmm_crc_part) {

	const __m128i xmm_fold4 = _mm_set_epi32(
		0x00000001, 0x54442bd4,
		0x00000001, 0xc6e41596);
	const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);

	__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
	__m128i xmm_a0_0, xmm_a0_1;
	__m128 ps_crc3, psa0_0, psa0_1, ps_res;

	// xmm_shr is xmm_shl with bit 7 of every control byte flipped. pshufb
	// zeroes a lane whose control byte has bit 7 set, so every lane zeroed by
	// one of the two shuffles is populated by the other one - which is what
	// allows the pairs below to be combined with a plain OR.
	xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
	xmm_shr = xmm_shl;
	xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);

	// part of crc0 that does not stay in the accumulator chain; it is folded
	// back in at the end
	xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);

	// shift the 64-byte accumulator window: each register keeps its xmm_shr
	// part and receives the xmm_shl part of its successor
	*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
	xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
	*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);

	*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
	xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
	*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);

	*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
	xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
	*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);

	// the last register is completed with the new partial input
	*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
	*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
	*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

	// fold the part taken from crc0: multiply both 64-bit halves with the
	// fold constant ...
	xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
	xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);

	ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
	psa0_0 = _mm_castsi128_ps(xmm_a0_0);
	psa0_1 = _mm_castsi128_ps(xmm_a0_1);

	// ... and xor both products into crc3 (xor issued in the float domain;
	// the casts do not change any bits)
	ps_res = _mm_xor_ps(ps_crc3, psa0_0);
	ps_res = _mm_xor_ps(ps_res, psa0_1);

	*xmm_crc3 = _mm_castps_si128(ps_res);
}
unsigned crc_mask[4] = { + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 +}; + +alignas(16) const unsigned crc_mask2[4] = { + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF +}; + +inline uint32_t crc_fold(const unsigned char *src, long len) { + unsigned long algn_diff; + __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; + + __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); + __m128i xmm_crc1 = _mm_setzero_si128(); + __m128i xmm_crc2 = _mm_setzero_si128(); + __m128i xmm_crc3 = _mm_setzero_si128(); + __m128i xmm_crc_part; + + if (len < 16) { + if (len == 0) + return 0; + xmm_crc_part = _mm_loadu_si128((__m128i *)src); + goto partial; + } + + algn_diff = (0 - (uintptr_t)src) & 0xF; + if (algn_diff) { + xmm_crc_part = _mm_loadu_si128((__m128i *)src); + + src += algn_diff; + len -= algn_diff; + + partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, + &xmm_crc_part); + } + + while ((len -= 64) >= 0) { + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + xmm_t3 = _mm_load_si128((__m128i *)src + 3); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); + + src += 64; + } + + /* + * len = num bytes left - 64 + */ + if (len + 16 >= 0) { + len += 16; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + + fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); + + if (len == 0) + goto done; + + xmm_crc_part = _mm_load_si128((__m128i *)src + 3); + } else if (len + 32 >= 0) { + len += 32; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + + 
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); + + if (len == 0) + goto done; + + xmm_crc_part = _mm_load_si128((__m128i *)src + 2); + } else if (len + 48 >= 0) { + len += 48; + + xmm_t0 = _mm_load_si128((__m128i *)src); + + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + + if (len == 0) + goto done; + + xmm_crc_part = _mm_load_si128((__m128i *)src + 1); + } else { + len += 64; + if (len == 0) + goto done; + xmm_crc_part = _mm_load_si128((__m128i *)src); + } + +partial: + partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, + &xmm_crc_part); +done: +{ + const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask); + const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2); + + uint32_t crc; + __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold; + + /* + * k1 + */ + crc_fold = _mm_load_si128((__m128i *)crc_k); + + x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10); + xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01); + xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0); + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0); + + x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10); + xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01); + xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1); + + x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10); + xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01); + xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2); + + /* + * k5 + */ + crc_fold = _mm_load_si128((__m128i *)crc_k + 1); + + xmm_crc0 = xmm_crc3; + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0); + xmm_crc0 = _mm_srli_si128(xmm_crc0, 8); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0); + + xmm_crc0 = xmm_crc3; + xmm_crc3 = _mm_slli_si128(xmm_crc3, 4); + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10); + 
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0); + xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2); + + /* + * k7 + */ + xmm_crc1 = xmm_crc3; + xmm_crc2 = xmm_crc3; + crc_fold = _mm_load_si128((__m128i *)crc_k + 2); + + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2); + xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask); + + xmm_crc2 = xmm_crc3; + xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1); + + crc = _mm_extract_epi32(xmm_crc3, 2); + return ~crc; +} + +} + +extern uint32_t (*crc32_pclmul)(const unsigned char *src, long len); +#endif + +void init_crc32_pclmul() +{ +#ifdef __PCLMUL__ + crc32_pclmul = &crc_fold; +#endif +} + +} diff --git a/lib/yencode/ScalarDecoder.cpp b/lib/yencode/ScalarDecoder.cpp new file mode 100644 index 00000000..f128e895 --- /dev/null +++ b/lib/yencode/ScalarDecoder.cpp @@ -0,0 +1,130 @@ +/* + * Based on node-yencode library by Anime Tosho: + * https://github.com/animetosho/node-yencode + * + * Copyright (C) 2017 Anime Tosho (animetosho) + * Copyright (C) 2017 Andrey Prygunkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
// Portable (non-SIMD) yEnc decoder.
//
// src    encoded input
// dest   output buffer; at most `len` bytes are written
// len    number of input bytes
// state  optional carry-over between consecutive calls (may be null):
//          0: previous characters are `\r\n` OR there is no previous character
//          1: previous character is `=`
//          2: previous character is `\r`
//          3: previous character is none of the above
//        When non-null it is read on entry and updated on return. When null
//        the input is treated as starting in state 0.
//
// Returns the number of bytes written to dest.
size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
	const unsigned char *es = src + len; // end source pointer; input is addressed with negative offsets
	unsigned char *p = dest; // destination pointer
	long i = -(long)len; // input position (runs from -len up to 0)
	unsigned char c; // input character

	if (len < 1) return 0;

	if (state) switch (*state) {
		case 1:
			// previous chunk ended with '=': the first byte is an escaped character
			c = es[i];
			*p++ = c - 42 - 64;
			i++;
			if (c == '\r' && i < 0) {
				*state = 2;
				// fall through to case 2
			}
			else {
				*state = 3;
				break;
			}
			// fall through
		case 2:
			if (es[i] != '\n') break;
			i++;
			*state = 0; // now `\r\n`
			// input exhausted: report what has been written so far (the
			// escaped byte of case 1 may already be in dest)
			if (i >= 0) return p - dest;
			// fall through
		case 0:
			// skip past first dot
			if (es[i] == '.') i++;
	}
	else // treat as *state == 0
		if (es[i] == '.') i++;

	// main loop; stops two bytes before the end so that looking ahead at
	// es[i + 1] and es[i + 2] never leaves the input
	for (; i < -2; i++) {
		c = es[i];
		switch (c) {
			case '\r':
				// skip past \r\n. sequences (compared byte by byte, so neither
				// alignment nor byte order matters)
				if (es[i + 1] == '\n' && es[i + 2] == '.')
					i += 2;
				// fall through
			case '\n':
				continue;
			case '=':
				c = es[i + 1];
				*p++ = c - 42 - 64;
				i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
				continue;
			default:
				*p++ = c - 42;
		}
	}
	if (state) *state = 3;

	if (i == -2) { // 2nd last char
		c = es[i];
		switch (c) {
			case '\r':
				if (state && es[i + 1] == '\n') {
					*state = 0;
					return p - dest;
				}
				// fall through
			case '\n':
				break;
			case '=':
				c = es[i + 1];
				*p++ = c - 42 - 64;
				i += (c != '\r');
				break;
			default:
				*p++ = c - 42;
		}
		i++;
	}

	// do final char; we process this separately to prevent an overflow if the final char is '='
	if (i == -1) {
		c = es[i];
		if (c != '\n' && c != '\r' && c != '=') {
			*p++ = c - 42;
		}
		else if (state) {
			if (c == '=') *state = 1;
			else if (c == '\r') *state = 2;
			else *state = 3;
		}
	}

	return p - dest;
}
+ */ + + +#include "nzbget.h" + +#if (defined(__i686__) || defined(__amd64__)) && !defined(WIN32) +#include +#endif + +#include "YEncode.h" + +namespace YEncode +{ + +size_t (*decode)(const unsigned char*, unsigned char*, size_t, char* state) = nullptr; +size_t (*decode_simd)(const unsigned char*, unsigned char*, size_t, char* state) = nullptr; +uint32_t (*crc32_simd)(const unsigned char* src, long len) = nullptr; +uint32_t (*inc_crc32_simd)(uint32_t crc, const unsigned char* src, long len) = nullptr; + +#if defined(__i686__) || defined(__amd64__) +size_t (*decode_sse2)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr; +extern void init_decode_sse2(); +size_t (*decode_ssse3)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr; +extern void init_decode_ssse3(); +uint32_t (*crc32_pclmul)(const unsigned char *src, long len) = nullptr; +extern void init_crc32_pclmul(); + +class CpuId +{ + uint32_t regs[4]; +public: + CpuId(unsigned level) + { +#ifdef WIN32 + __cpuid((int *)regs, (int)level); +#else + __cpuid(level, regs[0], regs[1], regs[2], regs[3]); +#endif + } + const uint32_t &EAX() const {return regs[0];} + const uint32_t &EBX() const {return regs[1];} + const uint32_t &ECX() const {return regs[2];} + const uint32_t &EDX() const {return regs[3];} +}; +#endif + +#if defined(__arm__) || defined(__aarch64__) +size_t (*decode_neon)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr; +extern void init_decode_neon(); +uint32_t (*crc32_arm)(const unsigned char *src, long len) = nullptr; +extern void init_crc32_arm(); +#endif + +void init() +{ + decode = &decode_scalar; + +#if defined(__i686__) || defined(__amd64__) + CpuId cpuid(1); + + bool cpu_supports_sse2 = cpuid.EDX() & 0x04000000; + bool cpu_supports_ssse3 = cpuid.ECX() & 0x00000200; + bool cpu_supports_sse41 = cpuid.ECX() & 0x00080000; + bool cpu_supports_pclmul = cpuid.ECX() & 0x00000002; + + if (cpu_supports_sse2) + { + 
init_decode_sse2(); + decode_simd = decode_sse2; + } + if (cpu_supports_ssse3) + { + init_decode_ssse3(); + if (decode_ssse3) + { + decode_simd = decode_ssse3; + } + } + if (cpu_supports_sse41 && cpu_supports_pclmul) + { + init_crc32_pclmul(); + crc32_simd = crc32_pclmul; + } +#endif + +#if defined(__arm__) || defined(__aarch64__) + bool cpu_supports_neon = false; + bool cpu_supports_crc = false; + +#ifdef __linux__ + if (FILE* file = fopen("/proc/cpuinfo", "r")) + { + char buf[200]; + while (fgets(buf, sizeof(buf), file)) + { + cpu_supports_neon |= !strncasecmp(buf, "Features", 8) && + (strstr(buf, " neon ") || strstr(buf, " asimd ")); + cpu_supports_crc |= !strncasecmp(buf, "Features", 8) && strstr(buf, " crc32 "); + } + fclose(file); + } +#endif + + if (cpu_supports_neon) + { + init_decode_neon(); + decode_simd = decode_neon; + } + if (cpu_supports_crc) + { + init_crc32_arm(); + crc32_simd = crc32_arm; + } +#endif + + if (decode_simd) + { + decode = decode_simd; + } +} + +} diff --git a/lib/yencode/Sse2Decoder.cpp b/lib/yencode/Sse2Decoder.cpp new file mode 100644 index 00000000..7d7cae3f --- /dev/null +++ b/lib/yencode/Sse2Decoder.cpp @@ -0,0 +1,230 @@ +/* + * Based on node-yencode library by Anime Tosho: + * https://github.com/animetosho/node-yencode + * + * Copyright (C) 2017 Anime Tosho (animetosho) + * Copyright (C) 2017 Andrey Prygunkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
// SSE2 yEnc decoder: processes the input in aligned 16-byte blocks and hands
// the unaligned head, the tail and short inputs to decode_scalar().
//
// src    encoded input
// dest   output buffer
//        NOTE(review): the vector path stores whole blocks / scratch bytes at
//        the current output position, so dest presumably needs room for `len`
//        bytes - confirm with callers
// len    number of input bytes; inputs of up to 32 bytes are decoded entirely
//        by decode_scalar()
// state  optional carry-over between calls (may be null), same meaning as in
//        decode_scalar(); the values written at the end of the vector loop
//        are 1 (escape pending), 0, 2 or 3
//
// Returns the number of bytes written to dest.
size_t do_decode_sse2(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
	if(len <= sizeof(__m128i)*2) return decode_scalar(src, dest, len, state);

	unsigned char *p = dest; // destination pointer
	unsigned long i = 0; // input position
	unsigned char escFirst = 0; // 1 when the first char of the next block needs unescaping (block before ended with '=')
	unsigned int nextMask = 0; // bits 0/1: bytes at the start of the next block that must be dropped (dot after \r\n)
	char tState = 0; // stand-in state when the caller passed none
	char* pState = state ? state : &tState;
	if((uintptr_t)src & ((sizeof(__m128i)-1))) {
		// find source memory alignment
		unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(__m128i)-1)) & ~(sizeof(__m128i)-1));

		// decode the bytes in front of the first 16-byte boundary with the scalar routine
		i = aSrc - src;
		p += decode_scalar(src, dest, i, pState);
	}

	// carry a line break that ended just before position i into the vector loop:
	// a leading '.' (state 0) or a leading "\n." (state 2) has to be dropped
	if(*pState == 0 && i+1 < len && src[i] == '.')
		nextMask = 1;
	else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.'))
		nextMask = 2;

	escFirst = *pState == 1;

	if(i + (sizeof(__m128i)+1) < len) {
		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
		size_t dLen = len - (sizeof(__m128i)+1);
		dLen = ((dLen-i) + 0xf) & ~0xf; // number of bytes handled by the vector loop, a multiple of 16
		unsigned char* dSrc = (unsigned char*)src + dLen + i; // end of the vector part; indexed with negative dI
		long dI = -dLen;
		i += dLen;

		for(; dI; dI += sizeof(__m128i)) {
			__m128i data = _mm_load_si128((__m128i *)(dSrc + dI));

			// search for special chars
			// (the two 16-bit patterns together match \r or \n in every byte lane)
			__m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')),
			cmp = _mm_or_si128(
				_mm_or_si128(
					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n
					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a)) // \n\r
				),
				cmpEq
			);

			unsigned int mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later

			__m128i oData;
			if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
				// first byte needs escaping due to preceding = in last loop iteration
				oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64));
			} else {
				oData = _mm_sub_epi8(data, _mm_set1_epi8(42));
			}
			mask &= ~escFirst; // an escaped first byte is data, even if it looks special
			mask |= nextMask; // bytes flagged by the previous block

			if (mask != 0) {
				// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
				// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that

				// firstly, resolve invalid sequences of = to deal with cases like '===='
				unsigned int maskEq = _mm_movemask_epi8(cmpEq);
				unsigned int tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
				maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;

				// a '=' in the last lane escapes the first byte of the next block
				escFirst = (maskEq >> (sizeof(__m128i)-1));
				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
				maskEq <<= 1;
				mask &= ~maskEq;

				// unescape chars following `=`
				oData = _mm_add_epi8(
					oData,
					LOAD_HALVES(
						eqAddLUT + (maskEq&0xff),
						eqAddLUT + ((maskEq>>8)&0xff)
					)
				);

				// handle \r\n. sequences
				// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
				// find instances of \r\n
				__m128i tmpData1, tmpData2;
				// data shifted down by 1 and 2 bytes, topped up with the bytes that follow this block
				tmpData1 = _mm_insert_epi16(_mm_srli_si128(data, 1), *(uint16_t*)(dSrc + dI + sizeof(__m128i)-1), 7);
				tmpData2 = _mm_insert_epi16(_mm_srli_si128(data, 2), *(uint16_t*)(dSrc + dI + sizeof(__m128i)), 7);
				__m128i cmp1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d));
				__m128i cmp2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d));
				// prepare to merge the two comparisons
				cmp1 = _mm_srli_si128(cmp1, 1);
				// find all instances of .
				tmpData2 = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.'));
				// merge matches of \r\n with those for .
				unsigned int killDots = _mm_movemask_epi8(
					_mm_and_si128(tmpData2, _mm_or_si128(cmp1, cmp2))
				);
				// bit n of killDots refers to the dot at byte n+2; the two top bits belong to the next block
				mask |= (killDots << 2) & 0xffff;
				nextMask = killDots >> (sizeof(__m128i)-2);

				// all that's left is to 'compress' the data (skip over masked chars)
				alignas(32) uint32_t mmTmp[4];
				_mm_store_si128((__m128i*)mmTmp, oData);

				// four groups of four bytes, each with its own 4 mask bits
				for(int j=0; j<4; j++) {
					if(mask & 0xf) {
						// every byte is written, but the output pointer only
						// advances for bytes whose mask bit is clear
						unsigned char* pMmTmp = (unsigned char*)(mmTmp + j);
						unsigned int maskn = ~mask;
						*p = pMmTmp[0];
						p += (maskn & 1);
						*p = pMmTmp[1];
						p += (maskn & 2) >> 1;
						*p = pMmTmp[2];
						p += (maskn & 4) >> 2;
						*p = pMmTmp[3];
						p += (maskn & 8) >> 3;
					} else {
						// nothing to drop in this group: copy all four bytes at once
						*(uint32_t*)p = mmTmp[j];
						p += 4;
					}
					mask >>= 4;
				}
			} else {
				// no special characters in this block: store it unchanged
				STOREU_XMM(p, oData);
				p += XMM_SIZE;
				escFirst = 0;
				nextMask = 0;
			}
		}

		// translate the loop's carry-over into the state value for the tail / next call
		if(escFirst) *pState = 1; // escape next character
		else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n
		else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r
		else *pState = 3;
	}

	// end alignment
	if(i < len) {
		p += decode_scalar(src + i, p, len - i, pState);
	}

	return p - dest;
}
init_decode_sse2() { +#ifdef __SSE2__ + decode_sse2 = &do_decode_sse2; + + // generate unshuf LUT + for(int i=0; i<256; i++) { + int k = i; + uint8_t res[8]; + int p = 0; + + // fix LUT + k = i; + p = 0; + for(int j=0; j<8; j++) { + k = i >> j; + if(k & 1) { + p |= 1 << j; + j++; + } + } + eqFixLUT[i] = p; + + // sub LUT + k = i; + for(int j=0; j<8; j++) { + res[j] = (k & 1) ? 192 /* == -64 */ : 0; + k >>= 1; + } + _mm_storel_epi64((__m128i*)(eqAddLUT + i), _mm_loadl_epi64((__m128i*)res)); + } +#endif +} + +} diff --git a/lib/yencode/Ssse3Decoder.cpp b/lib/yencode/Ssse3Decoder.cpp new file mode 100644 index 00000000..172bda4b --- /dev/null +++ b/lib/yencode/Ssse3Decoder.cpp @@ -0,0 +1,243 @@ +/* + * Based on node-yencode library by Anime Tosho: + * https://github.com/animetosho/node-yencode + * + * Copyright (C) 2017 Anime Tosho (animetosho) + * Copyright (C) 2017 Andrey Prygunkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
// SSSE3 yEnc decoder: same block-wise scheme as the SSE2 decoder, but masked
// bytes are removed with byte shuffles (pshufb) instead of a per-byte loop.
//
// src    encoded input
// dest   output buffer
//        NOTE(review): a full 16-byte block is stored at the current output
//        position even when fewer bytes are kept, so dest presumably needs
//        room for `len` bytes - confirm with callers
// len    number of input bytes; inputs of up to 32 bytes are decoded entirely
//        by decode_scalar()
// state  optional carry-over between calls (may be null), same meaning as in
//        decode_scalar(); the values written at the end of the vector loop
//        are 1 (escape pending), 0, 2 or 3
//
// Returns the number of bytes written to dest.
size_t do_decode_ssse3(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
	if(len <= sizeof(__m128i)*2) return decode_scalar(src, dest, len, state);

	unsigned char *p = dest; // destination pointer
	unsigned long i = 0; // input position
	unsigned char escFirst = 0; // 1 when the first char of the next block needs unescaping (block before ended with '=')
	unsigned int nextMask = 0; // bits 0/1: bytes at the start of the next block that must be dropped (dot after \r\n)
	char tState = 0; // stand-in state when the caller passed none
	char* pState = state ? state : &tState;
	if((uintptr_t)src & ((sizeof(__m128i)-1))) {
		// find source memory alignment
		unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(__m128i)-1)) & ~(sizeof(__m128i)-1));

		// decode the bytes in front of the first 16-byte boundary with the scalar routine
		i = aSrc - src;
		p += decode_scalar(src, dest, i, pState);
	}

	// handle finicky case of \r\n. straddled across initial boundary
	if(*pState == 0 && i+1 < len && src[i] == '.')
		nextMask = 1;
	else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.'))
		nextMask = 2;

	escFirst = *pState == 1;

	if(i + (sizeof(__m128i)+1) < len) {
		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
		size_t dLen = len - (sizeof(__m128i)+1);
		dLen = ((dLen-i) + 0xf) & ~0xf; // number of bytes handled by the vector loop, a multiple of 16
		unsigned char* dSrc = (unsigned char*)src + dLen + i; // end of the vector part; indexed with negative dI
		long dI = -dLen;
		i += dLen;

		for(; dI; dI += sizeof(__m128i)) {
			__m128i data = _mm_load_si128((__m128i *)(dSrc + dI));

			// search for special chars
			// (the two 16-bit patterns together match \r or \n in every byte lane)
			__m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')),
			cmp = _mm_or_si128(
				_mm_or_si128(
					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n
					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a)) // \n\r
				),
				cmpEq
			);

			unsigned int mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later

			__m128i oData;
			if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
				// first byte needs escaping due to preceding = in last loop iteration
				oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64));
			} else {
				oData = _mm_sub_epi8(data, _mm_set1_epi8(42));
			}
			mask &= ~escFirst; // an escaped first byte is data, even if it looks special
			mask |= nextMask; // bytes flagged by the previous block

			if (mask != 0) {
				// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
				// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that

				// firstly, resolve invalid sequences of = to deal with cases like '===='
				unsigned int maskEq = _mm_movemask_epi8(cmpEq);
				unsigned int tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
				maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;

				// a '=' in the last lane escapes the first byte of the next block
				escFirst = (maskEq >> (sizeof(__m128i)-1));
				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
				maskEq <<= 1;
				mask &= ~maskEq;

				// unescape chars following `=`
				oData = _mm_add_epi8(
					oData,
					LOAD_HALVES(
						eqAddLUT + (maskEq&0xff),
						eqAddLUT + ((maskEq>>8)&0xff)
					)
				);

				// handle \r\n. sequences
				// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
				// find instances of \r\n
				__m128i tmpData1, tmpData2;
				// the following aligned block supplies the bytes needed to look past the end of this one
				__m128i nextData = _mm_load_si128((__m128i *)(dSrc + dI) + 1);
				// data shifted down by 1 and 2 bytes, topped up from nextData
				tmpData1 = _mm_alignr_epi8(nextData, data, 1);
				tmpData2 = _mm_alignr_epi8(nextData, data, 2);
				__m128i cmp1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d));
				__m128i cmp2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d));
				// prepare to merge the two comparisons
				cmp1 = _mm_srli_si128(cmp1, 1);
				// find all instances of .
				tmpData2 = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.'));
				// merge matches of \r\n with those for .
				unsigned int killDots = _mm_movemask_epi8(
					_mm_and_si128(tmpData2, _mm_or_si128(cmp1, cmp2))
				);
				// bit n of killDots refers to the dot at byte n+2; the two top bits belong to the next block
				mask |= (killDots << 2) & 0xffff;
				nextMask = killDots >> (sizeof(__m128i)-2);

				// all that's left is to 'compress' the data (skip over masked chars)
				unsigned char skipped = BitsSetTable256[mask & 0xff]; // number of bytes dropped from the lower half
				// lookup compress masks and shuffle
				// load up two halves
				__m128i shuf = LOAD_HALVES(unshufLUT + (mask&0xff), unshufLUT + (mask>>8));

				// offset upper half by 8
				shuf = _mm_add_epi8(shuf, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
				// shift down upper half into lower
				// TODO: consider using `mask & 0xff` in table instead of counting bits
				shuf = _mm_shuffle_epi8(shuf, _mm_load_si128(pshufb_combine_table + skipped));

				// shuffle data
				oData = _mm_shuffle_epi8(oData, shuf);
				STOREU_XMM(p, oData);

				// increment output position
				// (16 minus the number of bytes dropped from both halves)
				p += XMM_SIZE - skipped - BitsSetTable256[mask >> 8];
			} else {
				// no special characters in this block: store it unchanged
				STOREU_XMM(p, oData);
				p += XMM_SIZE;
				escFirst = 0;
				nextMask = 0;
			}
		}

		// translate the loop's carry-over into the state value for the tail / next call
		if(escFirst) *pState = 1; // escape next character
		else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n
		else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r
		else *pState = 3;
	}

	// end alignment
	if(i < len) {
		p += decode_scalar(src + i, p, len - i, pState);
	}

	return p - dest;
}
--- /dev/null
+++ b/lib/yencode/YEncode.h
@@ -0,0 +1,38 @@
+/*
+ * Based on node-yencode library by Anime Tosho:
+ * https://github.com/animetosho/node-yencode
+ *
+ * Copyright (C) 2017 Anime Tosho (animetosho)
+ * Copyright (C) 2017 Andrey Prygunkov
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// Public entry points of the yEnc decoder / CRC32 routines; uses size_t and uint32_t, which must already be declared by the including file.
+#ifndef YENCODE_H
+#define YENCODE_H
+
+namespace YEncode
+{
+// All decoders take (input, output, input length, state); the SSSE3 implementation accepts a null state.
+void init(); // NOTE(review): presumably sets up the function pointers below - defined outside of this header, confirm
+extern size_t (*decode)(const unsigned char* inbuf, unsigned char* outbuf, size_t, char* state); // decoder function pointer, assigned outside of this header
+extern size_t (*decode_simd)(const unsigned char* inbuf, unsigned char* outbuf, size_t, char* state); // SIMD decoder function pointer, assigned outside of this header
+size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, char* state); // non-pointer decoder; callers use the result as the number of bytes written to dest
+extern uint32_t (*crc32_simd)(const unsigned char* src, long len); // presumably CRC32 of len bytes at src - confirm
+extern uint32_t (*inc_crc32_simd)(uint32_t crc, const unsigned char* src, long len); // presumably continues a CRC32 from the previous value crc - confirm
+
+}
+
+#endif
diff --git a/nzbget.vcxproj b/nzbget.vcxproj index edaf80d6..72d9a6b6 100755 --- a/nzbget.vcxproj +++ b/nzbget.vcxproj @@ -79,7 +79,7 @@ Disabled - .\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\windows\resources;%(AdditionalIncludeDirectories) +
.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\lib\yencode;.\windows\resources;%(AdditionalIncludeDirectories) WIN32;PACKAGE="nzbget";VERSION="20.0-testing";_DEBUG;_CONSOLE;DEBUG;_WIN32_WINNT=0x0403;%(PreprocessorDefinitions) false EnableFastChecks @@ -100,7 +100,7 @@ Disabled - .\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\windows\resources;%(AdditionalIncludeDirectories) + .\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\lib\yencode;.\windows\resources;%(AdditionalIncludeDirectories) WIN32;PACKAGE="nzbget";VERSION="20.0-testing";_DEBUG;_CONSOLE;DEBUG;_WIN32_WINNT=0x0403;%(PreprocessorDefinitions) false EnableFastChecks @@ -119,7 +119,7 @@ - .\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\windows\resources;%(AdditionalIncludeDirectories) + .\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\lib\yencode;.\windows\resources;%(AdditionalIncludeDirectories) WIN32;PACKAGE="nzbget";VERSION="20.0-testing";NDEBUG;_CONSOLE;_WIN32_WINNT=0x0403;%(PreprocessorDefinitions) Sync MultiThreaded @@ -150,7 +150,7 @@ - 
.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\windows\resources;%(AdditionalIncludeDirectories) + .\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\lib\yencode;.\windows\resources;%(AdditionalIncludeDirectories) WIN32;PACKAGE="nzbget";VERSION="20.0-testing";NDEBUG;_CONSOLE;_WIN32_WINNT=0x0403;%(PreprocessorDefinitions) Sync MultiThreaded @@ -253,10 +253,7 @@ - Create - Create - Create - Create + Create @@ -279,6 +276,11 @@ + + + + + @@ -379,6 +381,7 @@ +