From 69a0db63f61bf37afaa47b08d874c494214deea7 Mon Sep 17 00:00:00 2001
From: Andrey Prygunkov <hugbug@users.sourceforge.net>
Date: Sun, 8 Oct 2017 20:49:13 +0200
Subject: [PATCH] #454: integrated node-yencode library by Anime Tosho

1) integrated the library; 2) split units by CPU architecture; 3)
extended makefile and configure script to detect CPU architecture and
use appropriate compiler flags; 4) runtime CPU features detection for
x86 and ARM with dynamic code dispatching; 5) temporary (for test
purposes) printing info about SIMD support to stdout on program
startup; 6) new SIMD routines are not yet used in the program
---
 Makefile.am                   |  20 +-
 Makefile.in                   |  69 +++++-
 configure                     |  93 ++++---
 configure.ac                  |  51 +++-
 daemon/main/nzbget.cpp        |   1 +
 daemon/main/nzbget.h          |  13 +-
 daemon/nntp/Decoder.cpp       |  14 ++
 daemon/nntp/Decoder.h         |   1 +
 lib/yencode/ArmCrc.cpp        |  94 +++++++
 lib/yencode/NeonDecoder.cpp   | 272 +++++++++++++++++++++
 lib/yencode/PclmulCrc.cpp     | 444 ++++++++++++++++++++++++++++++++++
 lib/yencode/ScalarDecoder.cpp | 130 ++++++++++
 lib/yencode/SimdInit.cpp      | 141 +++++++++++
 lib/yencode/Sse2Decoder.cpp   | 230 ++++++++++++++++++
 lib/yencode/Ssse3Decoder.cpp  | 243 +++++++++++++++++++
 lib/yencode/YEncode.h         |  38 +++
 nzbget.vcxproj                |  19 +-
 17 files changed, 1796 insertions(+), 77 deletions(-)
 create mode 100644 lib/yencode/ArmCrc.cpp
 create mode 100644 lib/yencode/NeonDecoder.cpp
 create mode 100644 lib/yencode/PclmulCrc.cpp
 create mode 100644 lib/yencode/ScalarDecoder.cpp
 create mode 100644 lib/yencode/SimdInit.cpp
 create mode 100644 lib/yencode/Sse2Decoder.cpp
 create mode 100644 lib/yencode/Ssse3Decoder.cpp
 create mode 100644 lib/yencode/YEncode.h

diff --git a/Makefile.am b/Makefile.am
index 346b3906..b1af410e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -214,6 +214,23 @@ nzbget_SOURCES += \
 	lib/par2/verificationpacket.h
 endif
 
+# Simd decoder and Crc32
+nzbget_SOURCES += \
+	lib/yencode/YEncode.h \
+	lib/yencode/SimdInit.cpp \
+	lib/yencode/ScalarDecoder.cpp \
+	lib/yencode/Sse2Decoder.cpp \
+	lib/yencode/Ssse3Decoder.cpp \
+	lib/yencode/PclmulCrc.cpp \
+	lib/yencode/NeonDecoder.cpp \
+	lib/yencode/ArmCrc.cpp
+
+lib/yencode/Sse2Decoder.$(OBJEXT) : CXXFLAGS+=$(SSE2_CXXFLAGS)
+lib/yencode/Ssse3Decoder.$(OBJEXT) : CXXFLAGS+=$(SSSE3_CXXFLAGS)
+lib/yencode/PclmulCrc.$(OBJEXT) : CXXFLAGS+=$(PCLMUL_CXXFLAGS)
+lib/yencode/NeonDecoder.$(OBJEXT) : CXXFLAGS+=$(NEON_CXXFLAGS)
+lib/yencode/ArmCrc.$(OBJEXT) : CXXFLAGS+=$(ARMCRC_CXXFLAGS)
+
 AM_CPPFLAGS = \
 	-I$(srcdir)/daemon/connect \
 	-I$(srcdir)/daemon/extension \
@@ -226,7 +243,8 @@ AM_CPPFLAGS = \
 	-I$(srcdir)/daemon/remote \
 	-I$(srcdir)/daemon/util \
 	-I$(srcdir)/daemon/nserv \
-	-I$(srcdir)/lib/par2
+	-I$(srcdir)/lib/par2 \
+	-I$(srcdir)/lib/yencode
 
 if WITH_TESTS
 nzbget_SOURCES += \
diff --git a/Makefile.in b/Makefile.in
index bb8743cc..27836422 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -312,6 +312,10 @@ am__nzbget_SOURCES_DIST = daemon/connect/Connection.cpp \
 	lib/par2/reedsolomon.h lib/par2/verificationhashtable.cpp \
 	lib/par2/verificationhashtable.h \
 	lib/par2/verificationpacket.cpp lib/par2/verificationpacket.h \
+	lib/yencode/YEncode.h lib/yencode/SimdInit.cpp \
+	lib/yencode/ScalarDecoder.cpp lib/yencode/Sse2Decoder.cpp \
+	lib/yencode/Ssse3Decoder.cpp lib/yencode/PclmulCrc.cpp \
+	lib/yencode/NeonDecoder.cpp lib/yencode/ArmCrc.cpp \
 	lib/catch/catch.h tests/suite/TestMain.cpp \
 	tests/suite/TestMain.h tests/suite/TestUtil.cpp \
 	tests/suite/TestUtil.h tests/main/CommandLineParserTest.cpp \
@@ -425,7 +429,13 @@ am_nzbget_OBJECTS = daemon/connect/Connection.$(OBJEXT) \
 	daemon/nserv/NntpServer.$(OBJEXT) \
 	daemon/nserv/NzbGenerator.$(OBJEXT) \
 	daemon/nserv/YEncoder.$(OBJEXT) code_revision.$(OBJEXT) \
-	$(am__objects_1) $(am__objects_2) $(am__objects_3)
+	$(am__objects_1) lib/yencode/SimdInit.$(OBJEXT) \
+	lib/yencode/ScalarDecoder.$(OBJEXT) \
+	lib/yencode/Sse2Decoder.$(OBJEXT) \
+	lib/yencode/Ssse3Decoder.$(OBJEXT) \
+	lib/yencode/PclmulCrc.$(OBJEXT) \
+	lib/yencode/NeonDecoder.$(OBJEXT) lib/yencode/ArmCrc.$(OBJEXT) \
+	$(am__objects_2) $(am__objects_3)
 nzbget_OBJECTS = $(am_nzbget_OBJECTS)
 nzbget_LDADD = $(LDADD)
 am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
@@ -553,6 +563,7 @@ am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+ARMCRC_CXXFLAGS = @ARMCRC_CXXFLAGS@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
@@ -585,6 +596,7 @@ MAINT = @MAINT@
 MAKE = @MAKE@
 MAKEINFO = @MAKEINFO@
 MKDIR_P = @MKDIR_P@
+NEON_CXXFLAGS = @NEON_CXXFLAGS@
 OBJEXT = @OBJEXT@
 PACKAGE = @PACKAGE@
 PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
@@ -594,11 +606,14 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
+PCLMUL_CXXFLAGS = @PCLMUL_CXXFLAGS@
 PKG_CONFIG = @PKG_CONFIG@
 PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
 PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
+SSE2_CXXFLAGS = @SSE2_CXXFLAGS@
+SSSE3_CXXFLAGS = @SSSE3_CXXFLAGS@
 STRIP = @STRIP@
 TAR = @TAR@
 VERSION = @VERSION@
@@ -668,6 +683,8 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 zlib_CFLAGS = @zlib_CFLAGS@
 zlib_LIBS = @zlib_LIBS@
+
+# Simd decoder and Crc32
 nzbget_SOURCES = daemon/connect/Connection.cpp \
 	daemon/connect/Connection.h daemon/connect/TlsSocket.cpp \
 	daemon/connect/TlsSocket.h daemon/connect/WebDownloader.cpp \
@@ -760,14 +777,18 @@ nzbget_SOURCES = daemon/connect/Connection.cpp \
 	daemon/nserv/NntpServer.h daemon/nserv/NntpServer.cpp \
 	daemon/nserv/NzbGenerator.h daemon/nserv/NzbGenerator.cpp \
 	daemon/nserv/YEncoder.h daemon/nserv/YEncoder.cpp \
-	code_revision.cpp $(am__append_1) $(am__append_2) \
-	$(am__append_3)
+	code_revision.cpp $(am__append_1) lib/yencode/YEncode.h \
+	lib/yencode/SimdInit.cpp lib/yencode/ScalarDecoder.cpp \
+	lib/yencode/Sse2Decoder.cpp lib/yencode/Ssse3Decoder.cpp \
+	lib/yencode/PclmulCrc.cpp lib/yencode/NeonDecoder.cpp \
+	lib/yencode/ArmCrc.cpp $(am__append_2) $(am__append_3)
 AM_CPPFLAGS = -I$(srcdir)/daemon/connect -I$(srcdir)/daemon/extension \
 	-I$(srcdir)/daemon/feed -I$(srcdir)/daemon/frontend \
 	-I$(srcdir)/daemon/main -I$(srcdir)/daemon/nntp \
 	-I$(srcdir)/daemon/postprocess -I$(srcdir)/daemon/queue \
 	-I$(srcdir)/daemon/remote -I$(srcdir)/daemon/util \
-	-I$(srcdir)/daemon/nserv -I$(srcdir)/lib/par2 $(am__append_4)
+	-I$(srcdir)/daemon/nserv -I$(srcdir)/lib/par2 \
+	-I$(srcdir)/lib/yencode $(am__append_4)
 EXTRA_DIST = \
 	$(windows_FILES) \
 	$(osx_FILES) \
@@ -1313,6 +1334,26 @@ lib/par2/verificationhashtable.$(OBJEXT): lib/par2/$(am__dirstamp) \
 	lib/par2/$(DEPDIR)/$(am__dirstamp)
 lib/par2/verificationpacket.$(OBJEXT): lib/par2/$(am__dirstamp) \
 	lib/par2/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/$(am__dirstamp):
+	@$(MKDIR_P) lib/yencode
+	@: > lib/yencode/$(am__dirstamp)
+lib/yencode/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) lib/yencode/$(DEPDIR)
+	@: > lib/yencode/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/SimdInit.$(OBJEXT): lib/yencode/$(am__dirstamp) \
+	lib/yencode/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/ScalarDecoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \
+	lib/yencode/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/Sse2Decoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \
+	lib/yencode/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/Ssse3Decoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \
+	lib/yencode/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/PclmulCrc.$(OBJEXT): lib/yencode/$(am__dirstamp) \
+	lib/yencode/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/NeonDecoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \
+	lib/yencode/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/ArmCrc.$(OBJEXT): lib/yencode/$(am__dirstamp) \
+	lib/yencode/$(DEPDIR)/$(am__dirstamp)
 tests/suite/$(am__dirstamp):
 	@$(MKDIR_P) tests/suite
 	@: > tests/suite/$(am__dirstamp)
@@ -1455,6 +1496,7 @@ mostlyclean-compile:
 	-rm -f daemon/remote/*.$(OBJEXT)
 	-rm -f daemon/util/*.$(OBJEXT)
 	-rm -f lib/par2/*.$(OBJEXT)
+	-rm -f lib/yencode/*.$(OBJEXT)
 	-rm -f tests/feed/*.$(OBJEXT)
 	-rm -f tests/main/*.$(OBJEXT)
 	-rm -f tests/nntp/*.$(OBJEXT)
@@ -1559,6 +1601,13 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@lib/par2/$(DEPDIR)/reedsolomon.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lib/par2/$(DEPDIR)/verificationhashtable.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lib/par2/$(DEPDIR)/verificationpacket.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/ArmCrc.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/NeonDecoder.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/PclmulCrc.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/ScalarDecoder.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/SimdInit.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/Sse2Decoder.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/Ssse3Decoder.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@tests/feed/$(DEPDIR)/FeedFilterTest.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@tests/main/$(DEPDIR)/CommandLineParserTest.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@tests/main/$(DEPDIR)/OptionsTest.Po@am__quote@
@@ -1940,6 +1989,8 @@ distclean-generic:
 	-rm -f daemon/util/$(am__dirstamp)
 	-rm -f lib/par2/$(DEPDIR)/$(am__dirstamp)
 	-rm -f lib/par2/$(am__dirstamp)
+	-rm -f lib/yencode/$(DEPDIR)/$(am__dirstamp)
+	-rm -f lib/yencode/$(am__dirstamp)
 	-rm -f tests/feed/$(DEPDIR)/$(am__dirstamp)
 	-rm -f tests/feed/$(am__dirstamp)
 	-rm -f tests/main/$(DEPDIR)/$(am__dirstamp)
@@ -1964,7 +2015,7 @@ clean-am: clean-binPROGRAMS clean-generic mostlyclean-am
 
 distclean: distclean-am
 	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
-	-rm -rf ./$(DEPDIR) daemon/connect/$(DEPDIR) daemon/extension/$(DEPDIR) daemon/feed/$(DEPDIR) daemon/frontend/$(DEPDIR) daemon/main/$(DEPDIR) daemon/nntp/$(DEPDIR) daemon/nserv/$(DEPDIR) daemon/postprocess/$(DEPDIR) daemon/queue/$(DEPDIR) daemon/remote/$(DEPDIR) daemon/util/$(DEPDIR) lib/par2/$(DEPDIR) tests/feed/$(DEPDIR) tests/main/$(DEPDIR) tests/nntp/$(DEPDIR) tests/postprocess/$(DEPDIR) tests/queue/$(DEPDIR) tests/suite/$(DEPDIR) tests/util/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) daemon/connect/$(DEPDIR) daemon/extension/$(DEPDIR) daemon/feed/$(DEPDIR) daemon/frontend/$(DEPDIR) daemon/main/$(DEPDIR) daemon/nntp/$(DEPDIR) daemon/nserv/$(DEPDIR) daemon/postprocess/$(DEPDIR) daemon/queue/$(DEPDIR) daemon/remote/$(DEPDIR) daemon/util/$(DEPDIR) lib/par2/$(DEPDIR) lib/yencode/$(DEPDIR) tests/feed/$(DEPDIR) tests/main/$(DEPDIR) tests/nntp/$(DEPDIR) tests/postprocess/$(DEPDIR) tests/queue/$(DEPDIR) tests/suite/$(DEPDIR) tests/util/$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-hdr distclean-tags
@@ -2015,7 +2066,7 @@ installcheck-am:
 maintainer-clean: maintainer-clean-am
 	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
 	-rm -rf $(top_srcdir)/autom4te.cache
-	-rm -rf ./$(DEPDIR) daemon/connect/$(DEPDIR) daemon/extension/$(DEPDIR) daemon/feed/$(DEPDIR) daemon/frontend/$(DEPDIR) daemon/main/$(DEPDIR) daemon/nntp/$(DEPDIR) daemon/nserv/$(DEPDIR) daemon/postprocess/$(DEPDIR) daemon/queue/$(DEPDIR) daemon/remote/$(DEPDIR) daemon/util/$(DEPDIR) lib/par2/$(DEPDIR) tests/feed/$(DEPDIR) tests/main/$(DEPDIR) tests/nntp/$(DEPDIR) tests/postprocess/$(DEPDIR) tests/queue/$(DEPDIR) tests/suite/$(DEPDIR) tests/util/$(DEPDIR)
+	-rm -rf ./$(DEPDIR) daemon/connect/$(DEPDIR) daemon/extension/$(DEPDIR) daemon/feed/$(DEPDIR) daemon/frontend/$(DEPDIR) daemon/main/$(DEPDIR) daemon/nntp/$(DEPDIR) daemon/nserv/$(DEPDIR) daemon/postprocess/$(DEPDIR) daemon/queue/$(DEPDIR) daemon/remote/$(DEPDIR) daemon/util/$(DEPDIR) lib/par2/$(DEPDIR) lib/yencode/$(DEPDIR) tests/feed/$(DEPDIR) tests/main/$(DEPDIR) tests/nntp/$(DEPDIR) tests/postprocess/$(DEPDIR) tests/queue/$(DEPDIR) tests/suite/$(DEPDIR) tests/util/$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
@@ -2064,6 +2115,12 @@ uninstall-am: uninstall-binPROGRAMS uninstall-dist_docDATA \
 .PRECIOUS: Makefile
 
 
+lib/yencode/Sse2Decoder.$(OBJEXT) : CXXFLAGS+=$(SSE2_CXXFLAGS)
+lib/yencode/Ssse3Decoder.$(OBJEXT) : CXXFLAGS+=$(SSSE3_CXXFLAGS)
+lib/yencode/PclmulCrc.$(OBJEXT) : CXXFLAGS+=$(PCLMUL_CXXFLAGS)
+lib/yencode/NeonDecoder.$(OBJEXT) : CXXFLAGS+=$(NEON_CXXFLAGS)
+lib/yencode/ArmCrc.$(OBJEXT) : CXXFLAGS+=$(ARMCRC_CXXFLAGS)
+
 # Note about "sed": 
 # We need to make some changes in installed files.
 # On Linux "sed" has option "-i" for in-place-edit. Unfortunateley the BSD version of "sed"
diff --git a/configure b/configure
index 1fec8ccd..bf7bc702 100755
--- a/configure
+++ b/configure
@@ -628,6 +628,11 @@ LTLIBOBJS
 LIBOBJS
 WITH_TESTS_FALSE
 WITH_TESTS_TRUE
+ARMCRC_CXXFLAGS
+NEON_CXXFLAGS
+PCLMUL_CXXFLAGS
+SSSE3_CXXFLAGS
+SSE2_CXXFLAGS
 zlib_LIBS
 zlib_CFLAGS
 nettle_LIBS
@@ -5756,48 +5761,13 @@ fi
 done
 
 
-for ac_header in sys/prctl.h
+for ac_header in sys/prctl.h regex.h endian.h getopt.h
 do :
-  ac_fn_cxx_check_header_mongrel "$LINENO" "sys/prctl.h" "ac_cv_header_sys_prctl_h" "$ac_includes_default"
-if test "x$ac_cv_header_sys_prctl_h" = xyes; then :
+  as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
+ac_fn_cxx_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
+if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
   cat >>confdefs.h <<_ACEOF
-#define HAVE_SYS_PRCTL_H 1
-_ACEOF
-
-fi
-
-done
-
-for ac_header in regex.h
-do :
-  ac_fn_cxx_check_header_mongrel "$LINENO" "regex.h" "ac_cv_header_regex_h" "$ac_includes_default"
-if test "x$ac_cv_header_regex_h" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_REGEX_H 1
-_ACEOF
-
-fi
-
-done
-
-for ac_header in endian.h
-do :
-  ac_fn_cxx_check_header_mongrel "$LINENO" "endian.h" "ac_cv_header_endian_h" "$ac_includes_default"
-if test "x$ac_cv_header_endian_h" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_ENDIAN_H 1
-_ACEOF
-
-fi
-
-done
-
-for ac_header in getopt.h
-do :
-  ac_fn_cxx_check_header_mongrel "$LINENO" "getopt.h" "ac_cv_header_getopt_h" "$ac_includes_default"
-if test "x$ac_cv_header_getopt_h" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_GETOPT_H 1
+#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
 _ACEOF
 
 fi
@@ -6569,12 +6539,12 @@ main ()
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: size_t" >&5
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: size_t" >&5
 $as_echo "size_t" >&6; }
-    SOCKLEN_T=size_t
+	SOCKLEN_T=size_t
 else
 
-    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 
 #include <stddef.h>
@@ -6591,14 +6561,14 @@ main ()
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
 
-      { $as_echo "$as_me:${as_lineno-$LINENO}: result: int" >&5
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: int" >&5
 $as_echo "int" >&6; }
-      SOCKLEN_T=int
+	SOCKLEN_T=int
 else
 
-      { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: could not determine" >&5
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: could not determine" >&5
 $as_echo "$as_me: WARNING: could not determine" >&2;}
-      SOCKLEN_T=int
+	SOCKLEN_T=int
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
@@ -8366,6 +8336,35 @@ $as_echo "#define DISABLE_GZIP 1" >>confdefs.h
 fi
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use SIMD-optimized routines" >&5
+$as_echo_n "checking whether to use SIMD-optimized routines... " >&6; }
+USE_SIMD=no
+case $host_cpu in
+	i?86|x86_64)
+		SSE2_CXXFLAGS="-msse2"
+		SSSE3_CXXFLAGS="-mssse3"
+		PCLMUL_CXXFLAGS="-msse4.1 -mpclmul"
+		USE_SIMD=yes
+		;;
+	arm*)
+		NEON_CXXFLAGS="-mfpu=neon"
+		ARMCRC_CXXFLAGS="-march=armv8-a+crc"
+		USE_SIMD=yes
+		;;
+	aarch64)
+		ARMCRC_CXXFLAGS="-march=armv8-a+crc"
+		USE_SIMD=yes
+		;;
+esac
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_SIMD" >&5
+$as_echo "$USE_SIMD" >&6; }
+
+
+
+
+
+
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use an empty SIGCHLD handler" >&5
 $as_echo_n "checking whether to use an empty SIGCHLD handler... " >&6; }
 # Check whether --enable-sigchld-handler was given.
diff --git a/configure.ac b/configure.ac
index 60c05819..31567676 100644
--- a/configure.ac
+++ b/configure.ac
@@ -65,10 +65,7 @@ fi
 dnl
 dnl Checks for header files.
 dnl
-AC_CHECK_HEADERS(sys/prctl.h)
-AC_CHECK_HEADERS(regex.h)
-AC_CHECK_HEADERS(endian.h)
-AC_CHECK_HEADERS(getopt.h)
+AC_CHECK_HEADERS(sys/prctl.h regex.h endian.h getopt.h)
 
 
 dnl
@@ -148,7 +145,7 @@ if test "$FOUND" = "no"; then
 		[	char* szHost; struct hostent hinfobuf; char* strbuf; int h_errnop;
 			struct hostent* hinfo = gethostbyname_r(szHost, &hinfobuf, strbuf, 1024, &h_errnop); ],
 		AC_MSG_RESULT([[yes, and it takes 5 arguments]])
-		FOUND="yes"                      
+		FOUND="yes"
 		AC_DEFINE([HAVE_GETHOSTBYNAME_R_5], 1, [Define to 1 if gethostbyname_r takes 5 arguments]),
 		FOUND="no")
 	
@@ -198,17 +195,17 @@ AC_TRY_COMPILE([
 #include <sys/types.h>
 #include <sys/socket.h>],[
 (void)getsockopt (1, 1, 1, NULL, (size_t*)NULL)],[
-    AC_MSG_RESULT(size_t)
-    SOCKLEN_T=size_t],[
-    AC_TRY_COMPILE([
+	AC_MSG_RESULT(size_t)
+	SOCKLEN_T=size_t],[
+	AC_TRY_COMPILE([
 #include <stddef.h>
 #include <sys/types.h>
 #include <sys/socket.h>],[
 (void)getsockopt (1, 1, 1, NULL, (int*)NULL)],[
-      AC_MSG_RESULT(int)
-      SOCKLEN_T=int],[
-      AC_MSG_WARN(could not determine)
-      SOCKLEN_T=int])])])
+	AC_MSG_RESULT(int)
+	SOCKLEN_T=int],[
+	AC_MSG_WARN(could not determine)
+	SOCKLEN_T=int])])])
 AC_DEFINE_UNQUOTED(SOCKLEN_T, $SOCKLEN_T, [Determine what socket length (socklen_t) data type is])
 
 
@@ -540,6 +537,36 @@ else
 fi
 
 
+dnl Select per-file SIMD compiler flags from the host CPU family.
+dnl "arm*" also matches names such as armv6l/armv7l reported on native ARM builds;
+dnl the flags only enable compilation, CPU feature support is detected at runtime.
+AC_MSG_CHECKING(whether to use SIMD-optimized routines)
+USE_SIMD=no
+case $host_cpu in
+	i?86|x86_64)
+		SSE2_CXXFLAGS="-msse2"
+		SSSE3_CXXFLAGS="-mssse3"
+		PCLMUL_CXXFLAGS="-msse4.1 -mpclmul"
+		USE_SIMD=yes
+		;;
+	arm*)
+		NEON_CXXFLAGS="-mfpu=neon"
+		ARMCRC_CXXFLAGS="-march=armv8-a+crc"
+		USE_SIMD=yes
+		;;
+	aarch64)
+		ARMCRC_CXXFLAGS="-march=armv8-a+crc"
+		USE_SIMD=yes
+		;;
+esac
+AC_MSG_RESULT($USE_SIMD)
+AC_SUBST([SSE2_CXXFLAGS])
+AC_SUBST([SSSE3_CXXFLAGS])
+AC_SUBST([PCLMUL_CXXFLAGS])
+AC_SUBST([NEON_CXXFLAGS])
+AC_SUBST([ARMCRC_CXXFLAGS])
+
+
 dnl 
 dnl Some Linux systems require an empty signal handler for SIGCHLD
 dnl in order for exit codes to be correctly delivered to parent process.
diff --git a/daemon/main/nzbget.cpp b/daemon/main/nzbget.cpp
index e6f5e000..b71b939b 100644
--- a/daemon/main/nzbget.cpp
+++ b/daemon/main/nzbget.cpp
@@ -262,6 +262,7 @@ void NZBGet::Init()
 #ifndef DISABLE_TLS
 		TlsSocket::Init();
 #endif
+		Decoder::Init();
 	}
 
 	CreateGlobals();
diff --git a/daemon/main/nzbget.h b/daemon/main/nzbget.h
index 43eba745..b93bbb80 100644
--- a/daemon/main/nzbget.h
+++ b/daemon/main/nzbget.h
@@ -59,9 +59,6 @@ compiled */
 /* Define to 1 if variadic macros are supported */
 #define HAVE_VARIADIC_MACROS
 
-/* Define to 1 if libpar2 supports cancelling (needs a special patch) */
-#define HAVE_PAR2_CANCEL
-
 /* Define to 1 if function GetAddrInfo is supported */
 #define HAVE_GETADDRINFO
 
@@ -95,6 +92,12 @@ compiled */
 #define _WIN32_WINNT 0x0501
 #endif
 
+#ifdef _WIN64
+#define __amd64__
+#else
+#define __i686__
+#endif
+
 #ifdef _DEBUG
 // detection of memory leaks
 #define _CRTDBG_MAP_ALLOC
@@ -312,6 +315,10 @@ typedef int pid_t;
 #define FOPEN_WB "wbN"
 #define FOPEN_AB "abN"
 
+#define __SSE2__
+#define __SSSE3__
+#define __PCLMUL__
+
 #ifdef DEBUG
 // redefine "exit" to avoid printing memory leaks report when terminated because of wrong command line switches
 #define exit(code) ExitProcess(code)
diff --git a/daemon/nntp/Decoder.cpp b/daemon/nntp/Decoder.cpp
index bff055f7..ced8aaf1 100644
--- a/daemon/nntp/Decoder.cpp
+++ b/daemon/nntp/Decoder.cpp
@@ -22,9 +22,23 @@
 #include "Decoder.h"
 #include "Log.h"
 #include "Util.h"
+#include "YEncode.h"
 
 const char* Decoder::FormatNames[] = { "Unknown", "yEnc", "UU" };
 
+void Decoder::Init()
+{
+	YEncode::init(); // NOTE(review): presumably selects the SIMD routines tested below; defined outside this hunk
+	// Report, per routine, whether a SIMD implementation is available (debug log).
+	debug("%s", YEncode::decode_simd ? "SIMD yEnc decoder can be used" : "SIMD yEnc decoder isn't available for this CPU");
+	debug("%s", YEncode::crc32_simd ? "SIMD Crc32 routine can be used" : "SIMD Crc32 routine isn't available for this CPU");
+	debug("%s", YEncode::inc_crc32_simd ? "SIMD Crc32 (incremental) routine can be used" : "SIMD Crc32 (incremental) routine isn't available for this CPU");
+	// Temporary (for test purposes): repeat the same information on stdout.
+	printf("%s\n", YEncode::decode_simd ? "SIMD yEnc decoder can be used" : "SIMD yEnc decoder isn't available for this CPU");
+	printf("%s\n", YEncode::crc32_simd ? "SIMD Crc32 routine can be used" : "SIMD Crc32 routine isn't available for this CPU");
+	printf("%s\n", YEncode::inc_crc32_simd ? "SIMD Crc32 (incremental) routine can be used" : "SIMD Crc32 (incremental) routine isn't available for this CPU");
+}
+
 void Decoder::Clear()
 {
 	m_articleFilename.Clear();
diff --git a/daemon/nntp/Decoder.h b/daemon/nntp/Decoder.h
index a6891aea..16894bfa 100644
--- a/daemon/nntp/Decoder.h
+++ b/daemon/nntp/Decoder.h
@@ -45,6 +45,7 @@ public:
 
 	static const char* FormatNames[];
 
+	static void Init();
 	virtual ~Decoder() {}
 	virtual EStatus Check() = 0;
 	virtual void Clear();
diff --git a/lib/yencode/ArmCrc.cpp b/lib/yencode/ArmCrc.cpp
new file mode 100644
index 00000000..85a8a973
--- /dev/null
+++ b/lib/yencode/ArmCrc.cpp
@@ -0,0 +1,94 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// inspired by https://github.com/jocover/crc32_armv8/blob/master/crc32_armv8.c
+
+#include "nzbget.h"
+
+#ifdef __ARM_FEATURE_CRC32
+#include <arm_acle.h>
+#endif
+
+namespace YEncode
+{
+#ifdef __ARM_FEATURE_CRC32
+
+inline uint32_t crc_arm(uint32_t crc, const unsigned char *src, long len) {
+	// Folds len bytes at src into the running value crc using the ACLE __crc32b/h/w/d intrinsics and returns it; first step: initial alignment
+	if (len >= 16) { // 16 is an arbitrary number; it just needs to be >=8
+		if ((uintptr_t)src & sizeof(uint8_t)) { // address bit 0 set: consume one byte
+			crc = __crc32b(crc, *src);
+			src++;
+			len--;
+		}
+		if ((uintptr_t)src & sizeof(uint16_t)) { // address bit 1 set: consume two bytes
+			crc = __crc32h(crc, *((uint16_t *)src));
+			src += sizeof(uint16_t);
+			len -= sizeof(uint16_t);
+		}
+		// aarch64 only: also consume four bytes if address bit 2 is set, then process 8 bytes per step
+#ifdef __aarch64__
+		if ((uintptr_t)src & sizeof(uint32_t)) {
+			crc = __crc32w(crc, *((uint32_t *)src));
+			src += sizeof(uint32_t);
+			len -= sizeof(uint32_t);
+		}
+	}
+	while ((len -= sizeof(uint64_t)) >= 0) { // len is negative on exit; its low bits still hold the leftover byte count
+		crc = __crc32d(crc, *((uint64_t *)src));
+		src += sizeof(uint64_t);
+	}
+	if (len & sizeof(uint32_t)) { // leftover has a 4-byte part
+		crc = __crc32w(crc, *((uint32_t *)src));
+		src += sizeof(uint32_t);
+	}
+#else
+	}
+	while ((len -= sizeof(uint32_t)) >= 0) { // len is negative on exit; its low bits still hold the leftover byte count
+		crc = __crc32w(crc, *((uint32_t *)src));
+		src += sizeof(uint32_t);
+	}
+#endif
+	if (len & sizeof(uint16_t)) { // leftover has a 2-byte part
+		crc = __crc32h(crc, *((uint16_t *)src));
+		src += sizeof(uint16_t);
+	}
+	if (len & sizeof(uint8_t)) // leftover has a final single byte
+		crc = __crc32b(crc, *src);
+	
+	return crc;
+}
+
+uint32_t do_crc32_arm(const unsigned char *src, long len)
+{ // One-shot checksum of len bytes at src, built on crc_arm.
+	return ~crc_arm(~0, src, len); // start from all-ones and complement the final value
+}
+
+extern uint32_t (*crc32_arm)(const unsigned char *src, long len);
+extern uint32_t (*inc_crc32_simd)(uint32_t crc, const unsigned char* src, long len);
+#endif
+
+void init_crc32_arm()
+{ // Publishes the ARM CRC routines through the function pointers; does nothing when built without __ARM_FEATURE_CRC32.
+#ifdef __ARM_FEATURE_CRC32
+	crc32_arm = &do_crc32_arm; // one-shot variant (applies the initial/final inversion itself)
+	inc_crc32_simd = &crc_arm; // incremental variant: caller passes in and gets back the raw running value
+#endif
+}
+
+}
diff --git a/lib/yencode/NeonDecoder.cpp b/lib/yencode/NeonDecoder.cpp
new file mode 100644
index 00000000..14e008f5
--- /dev/null
+++ b/lib/yencode/NeonDecoder.cpp
@@ -0,0 +1,272 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  Copyright (C) 2017 Anime Tosho (animetosho)
+ *  Copyright (C) 2017 Andrey Prygunkov <hugbug@users.sourceforge.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "nzbget.h"
+
+#include "YEncode.h"
+
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+namespace YEncode
+{
+#ifdef __ARM_NEON
+
+// combine two 8-bit ints into a 16-bit one
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define UINT16_PACK(a, b) ((a) | ((b) << 8))
+#else
+#define UINT16_PACK(a, b) (((a) << 8) | (b))
+#endif
+
+// table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
+static const unsigned char BitsSetTable256[256] = 
+{
+#   define B2(n) n,     n+1,     n+1,     n+2
+#   define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2)
+#   define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2)
+    B6(0), B6(1), B6(1), B6(2)
+#undef B2
+#undef B4
+#undef B6
+};
+
+static uint16_t neon_movemask(uint8x16_t in) { // Packs one bit per byte lane of `in` into 16 bits; callers pass all-ones/all-zeros lanes from vceqq comparisons
+	uint8x16_t mask = vandq_u8(in, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}); // keep one distinct bit per lane within each 8-lane half
+# if defined(__aarch64__) && 0
+	// TODO: is this better?
+	return (vaddv_u8(vget_high_u8(mask)) << 8) | vaddv_u8(vget_low_u8(mask));
+# else
+	uint8x8_t res = vpadd_u8(vget_low_u8(mask), vget_high_u8(mask)); // three rounds of pairwise adds sum each half into a single byte
+	res = vpadd_u8(res, res);
+	res = vpadd_u8(res, res);
+	return vget_lane_u16(vreinterpret_u16_u8(res), 0); // bytes 0 and 1 (low-half sum, high-half sum) read as one 16-bit lane
+# endif
+}
+
+uint8_t eqFixLUT[256];
+alignas(32) uint8x8_t eqAddLUT[256];
+alignas(32) uint8x8_t unshufLUT[256];
+alignas(32) static const uint8_t pshufb_combine_table[272] = {
+	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,
+	0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,
+	0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,
+	0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,
+	0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,
+	0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,
+	0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
+	0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
+};
+
+size_t do_decode_neon(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
+	if(len <= sizeof(uint8x16_t)*2) return decode_scalar(src, dest, len, state); // short inputs go entirely through the scalar decoder
+	// Decodes len bytes of yEnc data from src into dest and returns the number of bytes written; *state (optional) carries parser state across calls
+	unsigned char *p = dest; // destination pointer
+	unsigned long i = 0; // input position
+	unsigned char escFirst = 0; // input character; first char needs escaping
+	unsigned int nextMask = 0; // removal bits carried into the start of the next 16-byte chunk
+	char tState = 0;
+	char* pState = state ? state : &tState;
+	if((uintptr_t)src & ((sizeof(uint8x16_t)-1))) {
+		// find source memory alignment
+		unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(uint8x16_t)-1)) & ~(sizeof(uint8x16_t)-1));
+		
+		i = aSrc - src;
+		p += decode_scalar(src, dest, i, pState);
+	}
+	
+	// handle finicky case of \r\n. straddled across initial boundary
+	if(*pState == 0 && i+1 < len && src[i] == '.')
+		nextMask = 1;
+	else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.'))
+		nextMask = 2;
+
+	escFirst = *pState == 1;
+	
+	if(i + (sizeof(uint8x16_t)+1) < len) {
+		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
+		size_t dLen = len - (sizeof(uint8x16_t)+1);
+		dLen = ((dLen-i) + 0xf) & ~0xf;
+		uint8_t* dSrc = (uint8_t*)src + dLen + i;
+		long dI = -dLen;
+		i += dLen;
+		
+		for(; dI; dI += sizeof(uint8x16_t)) {
+			uint8x16_t data = vld1q_u8(dSrc + dI);
+			
+			// search for special chars
+			uint8x16_t cmpEq = vceqq_u8(data, vdupq_n_u8('=')),
+			cmp = vorrq_u8(
+				vorrq_u8(
+					vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0a0d))), // \r\n
+					vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0d0a)))  // \n\r
+				),
+				cmpEq
+			);
+			uint16_t mask = neon_movemask(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later
+			
+			uint8x16_t oData;
+			if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
+				// first byte needs escaping due to preceding = in last loop iteration
+				oData = vsubq_u8(data, (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42});
+			} else {
+				oData = vsubq_u8(data, vdupq_n_u8(42));
+			}
+			mask &= ~escFirst;
+			mask |= nextMask;
+			
+			if (mask != 0) {
+				// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+				// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+				
+				// firstly, resolve invalid sequences of = to deal with cases like '===='
+				uint16_t maskEq = neon_movemask(cmpEq);
+				uint16_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
+				maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;
+				
+				escFirst = (maskEq >> (sizeof(uint8x16_t)-1));
+				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+				maskEq <<= 1;
+				mask &= ~maskEq;
+				
+				// unescape chars following `=`
+				oData = vaddq_u8(
+					oData,
+					vcombine_u8(
+						vld1_u8((uint8_t*)(eqAddLUT + (maskEq&0xff))),
+						vld1_u8((uint8_t*)(eqAddLUT + ((maskEq>>8)&0xff)))
+					)
+				);
+				
+				// handle \r\n. sequences
+				// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+				// find instances of \r\n
+				uint8x16_t tmpData1, tmpData2;
+				uint8x16_t nextData = vld1q_u8(dSrc + dI + sizeof(uint8x16_t));
+				tmpData1 = vextq_u8(data, nextData, 1);
+				tmpData2 = vextq_u8(data, nextData, 2);
+				uint8x16_t cmp1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(data), vdupq_n_u16(0x0a0d)));
+				uint8x16_t cmp2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData1), vdupq_n_u16(0x0a0d)));
+				// prepare to merge the two comparisons
+				cmp1 = vextq_u8(cmp1, vdupq_n_u8(0), 1);
+				// find all instances of .
+				tmpData2 = vceqq_u8(tmpData2, vdupq_n_u8('.'));
+				// merge matches of \r\n with those for .
+				uint16_t killDots = neon_movemask(
+					vandq_u8(tmpData2, vorrq_u8(cmp1, cmp2))
+				);
+				mask |= (killDots << 2) & 0xffff;
+				nextMask = killDots >> (sizeof(uint8x16_t)-2);
+
+				// all that's left is to 'compress' the data (skip over masked chars)
+				unsigned char skipped = BitsSetTable256[mask & 0xff];
+				// lookup compress masks and shuffle
+				oData = vcombine_u8(
+					vtbl1_u8(vget_low_u8(oData),  vld1_u8((uint8_t*)(unshufLUT + (mask&0xff)))),
+					vtbl1_u8(vget_high_u8(oData), vld1_u8((uint8_t*)(unshufLUT + (mask>>8))))
+				);
+				// compact down
+				uint8x16_t compact = vld1q_u8(pshufb_combine_table + skipped*sizeof(uint8x16_t));
+# ifdef __aarch64__
+				oData = vqtbl1q_u8(oData, compact);
+# else
+				uint8x8x2_t dataH = {vget_low_u8(oData), vget_high_u8(oData)};
+				oData = vcombine_u8(vtbl2_u8(dataH, vget_low_u8(compact)),
+				                    vtbl2_u8(dataH, vget_high_u8(compact)));
+# endif
+				vst1q_u8(p, oData);
+				
+				// increment output position
+				p += sizeof(uint8x16_t) - skipped - BitsSetTable256[mask >> 8];
+				
+			} else {
+				vst1q_u8(p, oData);
+				p += sizeof(uint8x16_t);
+				escFirst = 0;
+				nextMask = 0;
+			}
+		}
+		
+		if(escFirst) *pState = 1; // escape next character
+		else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n
+		else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r
+		else *pState = 3;
+	}
+	
+	// end alignment
+	if(i < len) {
+		p += decode_scalar(src + i, p, len - i, pState);
+	}
+	
+	return p - dest;
+}
+
+extern size_t (*decode_neon)(const unsigned char* src, unsigned char* dest, size_t len, char* state);
+#endif
+
+void init_decode_neon() {
+#ifdef __ARM_NEON
+	decode_neon = &do_decode_neon;
+
+	for(int i=0; i<256; i++) {
+		int k = i;
+		uint8_t res[8];
+		int p = 0;
+		
+		// fix LUT: for '=' bitmask i, keep a set bit and ignore the bit right after it (in a run of '=' every second one is an escape char)
+		k = i;
+		p = 0;
+		for(int j=0; j<8; j++) {
+			k = i >> j;
+			if(k & 1) {
+				p |= 1 << j;
+				j++;
+			}
+		}
+		eqFixLUT[i] = p;
+		
+		// sub LUT (eqAddLUT): byte j is 192 where bit j of i is set, else 0; added to the data to unescape chars following '='
+		k = i;
+		for(int j=0; j<8; j++) {
+			res[j] = (k & 1) ? 192 /* == -64 */ : 0;
+			k >>= 1;
+		}
+		vst1_u8((uint8_t*)(eqAddLUT + i), vld1_u8(res));
+		// unshuffle LUT: indices of the clear bits of i packed to the front, remaining slots filled with 0
+		k = i;
+		p = 0;
+		for(int j=0; j<8; j++) {
+			if(!(k & 1)) {
+				res[p++] = j;
+			}
+			k >>= 1;
+		}
+		for(; p<8; p++)
+			res[p] = 0;
+		vst1_u8((uint8_t*)(unshufLUT + i), vld1_u8(res));
+	}
+#endif
+}
+
+}
diff --git a/lib/yencode/PclmulCrc.cpp b/lib/yencode/PclmulCrc.cpp
new file mode 100644
index 00000000..7080fa62
--- /dev/null
+++ b/lib/yencode/PclmulCrc.cpp
@@ -0,0 +1,444 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// taken from zlib-ng / Intel's zlib patch, modified to remove zlib dependencies
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ 
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Authors:
+ *  Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *  Jim Guilford	<james.guilford@intel.com>
+ *  Vinodh Gopal	<vinodh.gopal@intel.com>
+ *  Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *  Jim Kukunas	 <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "nzbget.h"
+
+#ifdef __PCLMUL__
+#include <immintrin.h>
+#endif
+
+namespace YEncode
+{
+#ifdef __PCLMUL__
+
+void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+	const __m128i xmm_fold4 = _mm_set_epi32(
+			0x00000001, 0x54442bd4,
+			0x00000001, 0xc6e41596);
+	// Advances the 4-register state by one 16-byte block: old crc0 is folded into crc3, the others move down one slot.
+	__m128i x_tmp3;
+	__m128 ps_crc0, ps_crc3, ps_res;
+	// crc3 is overwritten below, so keep its value for the final rotation
+	x_tmp3 = *xmm_crc3;
+	// carry-less multiply both halves of old crc0 by the two constants and XOR the products
+	*xmm_crc3 = *xmm_crc0;
+	*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01); // high qword of crc0 * low qword of xmm_fold4
+	*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10); // low qword of crc0 * high qword of xmm_fold4
+	ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+	ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+	ps_res = _mm_xor_ps(ps_crc0, ps_crc3); // the casts only reinterpret bits; this is a plain 128-bit XOR
+	// rotate: crc0 <- crc1, crc1 <- crc2, crc2 <- old crc3, crc3 <- folded value (crc_fold then XORs new data into crc3)
+	*xmm_crc0 = *xmm_crc1;
+	*xmm_crc1 = *xmm_crc2;
+	*xmm_crc2 = x_tmp3;
+	*xmm_crc3 = _mm_castps_si128(ps_res);
+}
+
+void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+	const __m128i xmm_fold4 = _mm_set_epi32(
+			0x00000001, 0x54442bd4,
+			0x00000001, 0xc6e41596);
+	// Advances the state by two 16-byte blocks: old crc0/crc1 are folded into crc2/crc3, old crc2/crc3 move to crc0/crc1.
+	__m128i x_tmp3, x_tmp2;
+	__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
+	// crc2 and crc3 are used as scratch below, so save them first
+	x_tmp3 = *xmm_crc3;
+	x_tmp2 = *xmm_crc2;
+	// fold old crc1 (both qword products XORed together) -> ps_res31
+	*xmm_crc3 = *xmm_crc1;
+	*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+	*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+	ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+	ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+	ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
+	// fold old crc0 -> ps_res20
+	*xmm_crc2 = *xmm_crc0;
+	*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+	*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+	ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+	ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+	ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
+	// rotate by two slots; crc_fold then XORs the two new data blocks into crc2 and crc3
+	*xmm_crc0 = x_tmp2;
+	*xmm_crc1 = x_tmp3;
+	*xmm_crc2 = _mm_castps_si128(ps_res20);
+	*xmm_crc3 = _mm_castps_si128(ps_res31);
+}
+
+void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+	const __m128i xmm_fold4 = _mm_set_epi32(
+			0x00000001, 0x54442bd4,
+			0x00000001, 0xc6e41596);
+	// Advances the state by three 16-byte blocks: old crc0..crc2 are folded into crc1..crc3, old crc3 moves to crc0.
+	__m128i x_tmp3;
+	__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
+	// crc3 is used as scratch below, so save it first
+	x_tmp3 = *xmm_crc3;
+	// fold old crc2 -> ps_res32 (ends up in crc3)
+	*xmm_crc3 = *xmm_crc2;
+	*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+	*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
+	ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+	ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+	ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
+	// fold old crc1 -> ps_res21 (ends up in crc2)
+	*xmm_crc2 = *xmm_crc1;
+	*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+	*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
+	ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+	ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+	ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
+	// fold old crc0 -> ps_res10 (ends up in crc1)
+	*xmm_crc1 = *xmm_crc0;
+	*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+	*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
+	ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+	ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+	ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
+	// rotate by three slots; crc_fold then XORs the three new data blocks into crc1..crc3
+	*xmm_crc0 = x_tmp3;
+	*xmm_crc1 = _mm_castps_si128(ps_res10);
+	*xmm_crc2 = _mm_castps_si128(ps_res21);
+	*xmm_crc3 = _mm_castps_si128(ps_res32);
+}
+
+void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
+	const __m128i xmm_fold4 = _mm_set_epi32(
+			0x00000001, 0x54442bd4,
+			0x00000001, 0xc6e41596);
+	// Advances the state by four 16-byte blocks (64 bytes): every register is folded in place, no rotation needed.
+	__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
+	__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
+	__m128 ps_t0, ps_t1, ps_t2, ps_t3;
+	__m128 ps_res0, ps_res1, ps_res2, ps_res3;
+	// each register is needed twice (once per qword product), so work on copies
+	x_tmp0 = *xmm_crc0;
+	x_tmp1 = *xmm_crc1;
+	x_tmp2 = *xmm_crc2;
+	x_tmp3 = *xmm_crc3;
+	// crc0: (high qword * low constant) XOR (low qword * high constant)
+	*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
+	x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
+	ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
+	ps_t0 = _mm_castsi128_ps(x_tmp0);
+	ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
+	// crc1: same operation
+	*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
+	x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
+	ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
+	ps_t1 = _mm_castsi128_ps(x_tmp1);
+	ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
+	// crc2: same operation
+	*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
+	x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
+	ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
+	ps_t2 = _mm_castsi128_ps(x_tmp2);
+	ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
+	// crc3: same operation
+	*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
+	x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
+	ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+	ps_t3 = _mm_castsi128_ps(x_tmp3);
+	ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
+	// write the folded values back; crc_fold then XORs the four new data blocks into crc0..crc3
+	*xmm_crc0 = _mm_castps_si128(ps_res0);
+	*xmm_crc1 = _mm_castps_si128(ps_res1);
+	*xmm_crc2 = _mm_castps_si128(ps_res2);
+	*xmm_crc3 = _mm_castps_si128(ps_res3);
+}
+
+alignas(32) const unsigned  pshufb_shf_table[60] = { /* 15 pshufb masks of 16 bytes; partial_fold uses row (len - 1) */
+	0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
+	0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */
+	0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */
+	0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
+	0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
+	0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
+	0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl  9 (16 - 7)/shr7 */
+	0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl  8 (16 - 8)/shr8 */
+	0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl  7 (16 - 9)/shr9 */
+	0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl  6 (16 -10)/shr10*/
+	0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl  5 (16 -11)/shr11*/
+	0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl  4 (16 -12)/shr12*/
+	0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl  3 (16 -13)/shr13*/
+	0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl  2 (16 -14)/shr14*/
+	0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl  1 (16 -15)/shr15*/
+};
+
+void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
+		__m128i *xmm_crc2, __m128i *xmm_crc3, __m128i *xmm_crc_part) {
+	// Folds a block of fewer than 16 bytes into the state; len must be 1..15 because it selects row len-1 of the 15-row table.
+	const __m128i xmm_fold4 = _mm_set_epi32(
+			0x00000001, 0x54442bd4,
+			0x00000001, 0xc6e41596);
+	const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
+	// only the first len bytes of *xmm_crc_part are used; they are shifted to the top of crc3 below
+	__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
+	__m128i xmm_a0_0, xmm_a0_1;
+	__m128 ps_crc3, psa0_0, psa0_1, ps_res;
+	// xmm_shl moves bytes up by (16 - len); flipping bit 7 of every index gives the complementary "down by len" mask
+	xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
+	xmm_shr = xmm_shl;
+	xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
+	// the low len bytes of crc0 leave the 64-byte window; keep them (moved to the top) for folding
+	xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
+	// shift the whole 4-register window down by len bytes, pulling bytes in from the next register
+	*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
+	xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
+	*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
+
+	*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
+	xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
+	*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
+
+	*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
+	xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
+	*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
+	// the new partial data fills the top len bytes of crc3 (note: *xmm_crc_part is modified in place)
+	*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
+	*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
+	*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
+	// fold the bytes that left crc0: both qword products, same constants as fold_1..fold_4
+	xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
+	xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
+
+	ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
+	psa0_0 = _mm_castsi128_ps(xmm_a0_0);
+	psa0_1 = _mm_castsi128_ps(xmm_a0_1);
+	// crc3 ^= both products (casts only reinterpret bits)
+	ps_res = _mm_xor_ps(ps_crc3, psa0_0);
+	ps_res = _mm_xor_ps(ps_res, psa0_1);
+
+	*xmm_crc3 = _mm_castps_si128(ps_res);
+}
+
+alignas(16) const unsigned crc_k[] = { /* reduction constants; crc_fold loads them as three 128-bit vectors (index 0, 1, 2) */
+	0xccaa009e, 0x00000000, /* rk1 */
+	0x751997d0, 0x00000001, /* rk2 */
+	0xccaa009e, 0x00000000, /* rk5 */
+	0x63cd6124, 0x00000001, /* rk6 */
+	0xf7011640, 0x00000001, /* rk7 */
+	0xdb710640, 0x00000001  /* rk8 */
+};
+
+alignas(16) const unsigned crc_mask[4] = { /* keeps the low 64 bits of a vector */
+	0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
+};
+
+alignas(16) const unsigned crc_mask2[4] = { /* clears the low 32 bits of a vector */
+	0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
+};
+
+inline uint32_t crc_fold(const unsigned char *src, long len) { // CRC-32 of src[0..len) via PCLMULQDQ folding; 0 for len <= 0
+	unsigned long algn_diff;
+	__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+	// folding state: four 128-bit accumulators, crc0 preloaded with the constant 0x9db42487
+	__m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+	__m128i xmm_crc1 = _mm_setzero_si128();
+	__m128i xmm_crc2 = _mm_setzero_si128();
+	__m128i xmm_crc3 = _mm_setzero_si128();
+	__m128i xmm_crc_part;
+	// short input: everything goes through a single partial fold
+	if (len < 16) {
+		if (len <= 0) return 0; // nothing to hash; also keeps a negative len away from partial_fold's table lookup
+		xmm_crc_part = _mm_setzero_si128();
+		memcpy(&xmm_crc_part, src, len); // copy only the valid bytes: a 16-byte load could run past the caller's buffer
+		goto partial;
+	}
+	// bring src up to 16-byte alignment so that the loads below can be aligned loads
+	algn_diff = (0 - (uintptr_t)src) & 0xF;
+	if (algn_diff) {
+		xmm_crc_part = _mm_loadu_si128((__m128i *)src); // safe: len >= 16 here
+
+		src += algn_diff;
+		len -= algn_diff;
+
+		partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
+			&xmm_crc_part);
+	}
+	// main loop: 64 bytes per iteration
+	while ((len -= 64) >= 0) {
+		xmm_t0 = _mm_load_si128((__m128i *)src);
+		xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+		xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+		xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+
+		fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+		xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+		xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+		xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+		xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+
+		src += 64;
+	}
+
+	/*
+	 * len = num bytes left - 64
+	 */
+	if (len + 16 >= 0) { // 48..63 bytes left: three whole blocks plus an optional partial one
+		len += 16;
+
+		xmm_t0 = _mm_load_si128((__m128i *)src);
+		xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+		xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+
+		fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+		xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+		xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+		xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+
+		if (len == 0)
+			goto done;
+		// aligned load: may read past src+len, but stays inside the aligned 16-byte chunk holding the last valid bytes
+		xmm_crc_part = _mm_load_si128((__m128i *)src + 3);
+	} else if (len + 32 >= 0) { // 32..47 bytes left
+		len += 32;
+
+		xmm_t0 = _mm_load_si128((__m128i *)src);
+		xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+
+		fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+		xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+		xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+
+		if (len == 0)
+			goto done;
+
+		xmm_crc_part = _mm_load_si128((__m128i *)src + 2);
+	} else if (len + 48 >= 0) { // 16..31 bytes left
+		len += 48;
+
+		xmm_t0 = _mm_load_si128((__m128i *)src);
+
+		fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+		xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+
+		if (len == 0)
+			goto done;
+
+		xmm_crc_part = _mm_load_si128((__m128i *)src + 1);
+	} else { // 0..15 bytes left
+		len += 64;
+		if (len == 0)
+			goto done;
+		xmm_crc_part = _mm_load_si128((__m128i *)src);
+	}
+
+partial:
+	partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
+		&xmm_crc_part);
+done:
+{
+	const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
+	const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
+
+	uint32_t crc;
+	__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
+
+	/*
+	 * k1
+	 */
+	crc_fold = _mm_load_si128((__m128i *)crc_k);
+	// collapse the four accumulators into crc3: crc0 -> crc1 -> crc2 -> crc3
+	x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
+	xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
+	xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
+	xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
+
+	x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
+	xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
+	xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
+	xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
+
+	x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
+	xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+
+	/*
+	 * k5
+	 */
+	crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
+
+	xmm_crc0 = xmm_crc3;
+	xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+	xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+
+	xmm_crc0 = xmm_crc3;
+	xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
+	xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
+	xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
+
+	/*
+	 * k7
+	 */
+	xmm_crc1 = xmm_crc3;
+	xmm_crc2 = xmm_crc3;
+	crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
+
+	xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+	xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
+
+	xmm_crc2 = xmm_crc3;
+	xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
+	xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
+	// the result sits in 32-bit lane 2; it is returned bit-inverted
+	crc = _mm_extract_epi32(xmm_crc3, 2);
+	return ~crc;
+}
+
+}
+
+extern uint32_t (*crc32_pclmul)(const unsigned char *src, long len);
+#endif
+
+void init_crc32_pclmul()
+{
+#if defined(__PCLMUL__)
+	crc32_pclmul = crc_fold; // publish the PCLMULQDQ routine; the pointer is left untouched when built without PCLMUL
+#endif
+}
+
+}
diff --git a/lib/yencode/ScalarDecoder.cpp b/lib/yencode/ScalarDecoder.cpp
new file mode 100644
index 00000000..f128e895
--- /dev/null
+++ b/lib/yencode/ScalarDecoder.cpp
@@ -0,0 +1,130 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  Copyright (C) 2017 Anime Tosho (animetosho)
+ *  Copyright (C) 2017 Andrey Prygunkov <hugbug@users.sourceforge.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "nzbget.h"
+
+namespace YEncode
+{
+
+// combine two 8-bit ints into a 16-bit one
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define UINT16_PACK(a, b) ((a) | ((b) << 8))
+#else
+#define UINT16_PACK(a, b) (((a) << 8) | (b))
+#endif
+
+// state var: refers to the previous state - only used for incremental processing (may be NULL; then treated as 0 and not updated)
+//   0: previous characters are `\r\n` OR there is no previous character
+//   1: previous character is `=`
+//   2: previous character is `\r`
+//   3: previous character is none of the above
+size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
+	unsigned char *es = (unsigned char*)src + len; // end source pointer
+	unsigned char *p = dest; // destination pointer
+	long i = -len; // input position
+	unsigned char c; // input character
+	// returns the number of bytes written to dest; input is indexed as es[i] with i running from -len up to 0
+	if (len < 1) return 0;
+	// first finish whatever sequence the previous chunk left open
+	if (state) switch (*state) {
+		case 1:
+			c = es[i];
+			*p++ = c - 42 - 64;
+			i++;
+			if (c != '\r') {
+				*state = 3;
+				break;
+			}
+			// an escaped \r may still open a \r\n. sequence; remember that even
+			// when it was the last byte of this chunk (was reported as state 3)
+			*state = 2;
+			if (i >= 0) return p - dest; // otherwise fall through to case 2
+		case 2:
+			if (es[i] != '\n') break;
+			i++;
+			*state = 0; // now `\r\n`
+			if (i >= 0) return p - dest; // keep the byte case 1 may have written (was: return 0)
+		case 0:
+			// skip past first dot
+			if (es[i] == '.') i++;
+	}
+	else // treat as *state == 0
+		if (es[i] == '.') i++;
+
+	for (; i < -2; i++) {
+		c = es[i];
+		switch (c) {
+			case '\r':
+				// skip past \r\n. sequences
+				if (es[i + 1] == '\n' && es[i + 2] == '.') // byte-wise: no unaligned 16-bit read, no endianness dependence
+					i += 2;
+			case '\n':
+				continue;
+			case '=':
+				c = es[i + 1];
+				*p++ = c - 42 - 64;
+				i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
+				continue;
+			default:
+				*p++ = c - 42;
+		}
+	}
+	if (state) *state = 3;
+
+	if (i == -2) { // 2nd last char
+		c = es[i];
+		switch (c) {
+			case '\r':
+				if (state && es[i + 1] == '\n') {
+					*state = 0;
+					return p - dest;
+				}
+			case '\n':
+				break;
+			case '=':
+				c = es[i + 1];
+				*p++ = c - 42 - 64;
+				i += (c != '\r');
+				break;
+			default:
+				*p++ = c - 42;
+		}
+		i++;
+	}
+
+	// do final char; we process this separately to prevent an overflow if the final char is '='
+	if (i == -1) {
+		c = es[i];
+		if (c != '\n' && c != '\r' && c != '=') {
+			*p++ = c - 42;
+		}
+		else if (state) {
+			if (c == '=') *state = 1;
+			else if (c == '\r') *state = 2;
+			else *state = 3;
+		}
+	}
+
+	return p - dest;
+}
+
+}
diff --git a/lib/yencode/SimdInit.cpp b/lib/yencode/SimdInit.cpp
new file mode 100644
index 00000000..a5309606
--- /dev/null
+++ b/lib/yencode/SimdInit.cpp
@@ -0,0 +1,141 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  Copyright (C) 2017 Anime Tosho (animetosho)
+ *  Copyright (C) 2017 Andrey Prygunkov <hugbug@users.sourceforge.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "nzbget.h"
+
+#if (defined(__i686__) || defined(__amd64__)) && !defined(WIN32)
+#include <cpuid.h>
+#endif
+
+#include "YEncode.h"
+
+namespace YEncode
+{
+
+size_t (*decode)(const unsigned char*, unsigned char*, size_t, char* state) = nullptr;
+size_t (*decode_simd)(const unsigned char*, unsigned char*, size_t, char* state) = nullptr;
+uint32_t (*crc32_simd)(const unsigned char* src, long len) = nullptr;
+uint32_t (*inc_crc32_simd)(uint32_t crc, const unsigned char* src, long len) = nullptr;
+
+#if defined(__i686__) || defined(__amd64__)
+size_t (*decode_sse2)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr;
+extern void init_decode_sse2();
+size_t (*decode_ssse3)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr;
+extern void init_decode_ssse3();
+uint32_t (*crc32_pclmul)(const unsigned char *src, long len) = nullptr;
+extern void init_crc32_pclmul();
+
+class CpuId
+{
+	uint32_t m_regs[4]; // EAX, EBX, ECX, EDX as filled in by the CPUID query
+public:
+	CpuId(unsigned level)
+	{
+#if defined(WIN32)
+		__cpuid(reinterpret_cast<int*>(m_regs), static_cast<int>(level));
+#else
+		__cpuid(level, m_regs[0], m_regs[1], m_regs[2], m_regs[3]);
+#endif
+	}
+	const uint32_t& EAX() const { return m_regs[0]; }
+	const uint32_t& EBX() const { return m_regs[1]; }
+	const uint32_t& ECX() const { return m_regs[2]; }
+	const uint32_t& EDX() const { return m_regs[3]; }
+};
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+size_t (*decode_neon)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr;
+extern void init_decode_neon();
+uint32_t (*crc32_arm)(const unsigned char *src, long len) = nullptr;
+extern void init_crc32_arm();
+#endif
+
+void init()
+{
+	decode = &decode_scalar; // portable fallback, replaced below when a SIMD decoder is usable
+
+#if defined(__i686__) || defined(__amd64__)
+	CpuId cpuid(1); // CPUID leaf 1: feature bits in ECX/EDX
+
+	bool cpu_supports_sse2 = cpuid.EDX() & 0x04000000; // EDX bit 26
+	bool cpu_supports_ssse3 = cpuid.ECX() & 0x00000200; // ECX bit 9
+	bool cpu_supports_sse41 = cpuid.ECX() & 0x00080000; // ECX bit 19
+	bool cpu_supports_pclmul = cpuid.ECX() & 0x00000002; // ECX bit 1
+
+	if (cpu_supports_sse2)
+	{
+		init_decode_sse2();
+		decode_simd = decode_sse2;
+	}
+	if (cpu_supports_ssse3)
+	{
+		init_decode_ssse3();
+		if (decode_ssse3) // stays null when the unit was built without SSSE3 support
+		{
+			decode_simd = decode_ssse3;
+		}
+	}
+	if (cpu_supports_sse41 && cpu_supports_pclmul)
+	{
+		init_crc32_pclmul();
+		crc32_simd = crc32_pclmul;
+	}
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+	bool cpu_supports_neon = false, cpu_supports_crc = false;
+
+#ifdef __linux__
+	if (FILE* file = fopen("/proc/cpuinfo", "r"))
+	{
+		char buf[2048]; // large enough for long "Features" lines; 200 bytes split them and hid later flags
+		while (fgets(buf, sizeof(buf), file))
+		{
+			if (strncasecmp(buf, "Features", 8)) continue; // only the CPU flags line is of interest
+			if (char* eol = strpbrk(buf, "\r\n")) *eol = ' '; // so that the last flag also matches " name "
+			cpu_supports_neon |= strstr(buf, " neon ") || strstr(buf, " asimd ");
+			cpu_supports_crc |= strstr(buf, " crc32 ") != nullptr;
+		}
+		fclose(file);
+	}
+#endif
+
+	if (cpu_supports_neon)
+	{
+		init_decode_neon();
+		decode_simd = decode_neon;
+	}
+	if (cpu_supports_crc)
+	{
+		init_crc32_arm();
+		crc32_simd = crc32_arm;
+	}
+#endif
+
+	if (decode_simd)
+	{
+		decode = decode_simd;
+	}
+}
+
+}
diff --git a/lib/yencode/Sse2Decoder.cpp b/lib/yencode/Sse2Decoder.cpp
new file mode 100644
index 00000000..7d7cae3f
--- /dev/null
+++ b/lib/yencode/Sse2Decoder.cpp
@@ -0,0 +1,230 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  Copyright (C) 2017 Anime Tosho (animetosho)
+ *  Copyright (C) 2017 Andrey Prygunkov <hugbug@users.sourceforge.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "nzbget.h"
+
+#include "YEncode.h"
+
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif
+
+namespace YEncode
+{
+#ifdef __SSE2__
+
+// combine two 8-bit ints into a 16-bit one
+#define UINT16_PACK(a, b) ((a) | ((b) << 8))
+
+#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/
+
+#define STOREU_XMM(dest, xmm) \
+  _mm_storeu_si128((__m128i*)(dest), xmm)
+
+#define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \
+	_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \
+	(b) \
+))
+
+uint8_t eqFixLUT[256];
+alignas(32) __m64 eqAddLUT[256];
+
+size_t do_decode_sse2(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
+	if(len <= sizeof(__m128i)*2) return decode_scalar(src, dest, len, state); // too short for an aligned SIMD pass
+	// returns the number of bytes written to dest; state has the same meaning as in decode_scalar and may be NULL
+	unsigned char *p = dest; // destination pointer
+	unsigned long i = 0; // input position
+	unsigned char escFirst = 0; // input character; first char needs escaping
+	unsigned int nextMask = 0;
+	char tState = 0;
+	char* pState = state ? state : &tState;
+	if((uintptr_t)src & ((sizeof(__m128i)-1))) {
+		// find source memory alignment
+		unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(__m128i)-1)) & ~(sizeof(__m128i)-1));
+		
+		i = aSrc - src;
+		p += decode_scalar(src, dest, i, pState);
+	}
+	// carry an open \r\n. sequence from the scalar prefix into the first vector as pre-set "skip" bits
+	if(*pState == 0 && i+1 < len && src[i] == '.')
+		nextMask = 1;
+	else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.'))
+		nextMask = 2;
+
+	escFirst = *pState == 1;
+	
+	if(i + (sizeof(__m128i)+1) < len) {
+		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
+		size_t dLen = len - (sizeof(__m128i)+1);
+		dLen = ((dLen-i) + 0xf) & ~0xf;
+		unsigned char* dSrc = (unsigned char*)src + dLen + i;
+		long dI = -dLen;
+		i += dLen;
+		// dI runs from -dLen up to 0 in 16-byte steps; dSrc + dI is the current aligned block
+		for(; dI; dI += sizeof(__m128i)) {
+			__m128i data = _mm_load_si128((__m128i *)(dSrc + dI));
+			
+			// search for special chars
+			__m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')),
+			cmp = _mm_or_si128(
+				_mm_or_si128(
+					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n
+					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a))  // \n\r
+				),
+				cmpEq
+			);
+
+			unsigned int mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later
+			
+			__m128i oData;
+			if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
+				// first byte needs escaping due to preceding = in last loop iteration
+				oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64));
+			} else {
+				oData = _mm_sub_epi8(data, _mm_set1_epi8(42));
+			}
+			mask &= ~escFirst;
+			mask |= nextMask;
+			
+			if (mask != 0) {
+				// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+				// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+
+				// firstly, resolve invalid sequences of = to deal with cases like '===='
+				unsigned int maskEq = _mm_movemask_epi8(cmpEq);
+				unsigned int tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
+				maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;
+				
+				escFirst = (maskEq >> (sizeof(__m128i)-1));
+				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+				maskEq <<= 1;
+				mask &= ~maskEq;
+				
+				// unescape chars following `=`
+				oData = _mm_add_epi8(
+					oData,
+					LOAD_HALVES(
+						eqAddLUT + (maskEq&0xff),
+						eqAddLUT + ((maskEq>>8)&0xff)
+					)
+				);
+
+				// handle \r\n. sequences
+				// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+				// find instances of \r\n
+				__m128i tmpData1, tmpData2;
+				tmpData1 = _mm_insert_epi16(_mm_srli_si128(data, 1), *(uint16_t*)(dSrc + dI + sizeof(__m128i)-1), 7);
+				tmpData2 = _mm_insert_epi16(_mm_srli_si128(data, 2), *(uint16_t*)(dSrc + dI + sizeof(__m128i)), 7);
+				__m128i cmp1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d));
+				__m128i cmp2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d));
+				// prepare to merge the two comparisons
+				cmp1 = _mm_srli_si128(cmp1, 1);
+				// find all instances of .
+				tmpData2 = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.'));
+				// merge matches of \r\n with those for .
+				unsigned int killDots = _mm_movemask_epi8(
+					_mm_and_si128(tmpData2, _mm_or_si128(cmp1, cmp2))
+				);
+				mask |= (killDots << 2) & 0xffff;
+				nextMask = killDots >> (sizeof(__m128i)-2);
+
+				// all that's left is to 'compress' the data (skip over masked chars)
+				alignas(32) uint32_t mmTmp[4];
+				_mm_store_si128((__m128i*)mmTmp, oData);
+				// four groups of 4 bytes; a group without masked bytes is copied with one 32-bit store
+				for(int j=0; j<4; j++) {
+					if(mask & 0xf) {
+						unsigned char* pMmTmp = (unsigned char*)(mmTmp + j);
+						unsigned int maskn = ~mask;
+						*p = pMmTmp[0];
+						p += (maskn & 1);
+						*p = pMmTmp[1];
+						p += (maskn & 2) >> 1;
+						*p = pMmTmp[2];
+						p += (maskn & 4) >> 2;
+						*p = pMmTmp[3];
+						p += (maskn & 8) >> 3;
+					} else {
+						*(uint32_t*)p = mmTmp[j];
+						p += 4;
+					}
+					mask >>= 4;
+				}
+			} else {
+				STOREU_XMM(p, oData);
+				p += XMM_SIZE;
+				escFirst = 0;
+				nextMask = 0;
+			}
+		}
+		// translate the loop's carry flags back into a decode_scalar state for the tail
+		if(escFirst) *pState = 1; // escape next character
+		else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n
+		else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r
+		else *pState = 3;
+	}
+	
+	// end alignment
+	if(i < len) {
+		p += decode_scalar(src + i, p, len - i, pState);
+	}
+	
+	return p - dest;
+}
+
+extern size_t (*decode_sse2)(const unsigned char* src, unsigned char* dest, size_t len, char* state);
+#endif
+
+void init_decode_sse2() {
+#ifdef __SSE2__
+	// publish the SSE2 routine through the dispatch pointer
+	decode_sse2 = do_decode_sse2;
+
+	// precompute the two lookup tables used by do_decode_sse2,
+	// each indexed by an 8-bit mask
+	for(int bits = 0; bits < 256; ++bits) {
+		// eqFixLUT: scanning from bit 0 upwards, keep a set bit and ignore the
+		// bit directly above it (the char after a kept '=' is escaped data)
+		int kept = 0;
+		int bit = 0;
+		while(bit < 8) {
+			if((bits >> bit) & 1) {
+				kept |= 1 << bit;
+				bit += 2;
+			} else {
+				bit += 1;
+			}
+		}
+		eqFixLUT[bits] = kept;
+
+		// eqAddLUT: byte n is 192 (== -64) when bit n of the mask is set, else 0
+		uint8_t addend[8];
+		for(int n = 0; n < 8; ++n) {
+			addend[n] = ((bits >> n) & 1) ? 192 : 0;
+		}
+		// store the 8 bytes as one 64-bit table entry
+		_mm_storel_epi64((__m128i*)(eqAddLUT + bits), _mm_loadl_epi64((__m128i*)addend));
+	}
+#endif
+}
+
+}
diff --git a/lib/yencode/Ssse3Decoder.cpp b/lib/yencode/Ssse3Decoder.cpp
new file mode 100644
index 00000000..172bda4b
--- /dev/null
+++ b/lib/yencode/Ssse3Decoder.cpp
@@ -0,0 +1,243 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  Copyright (C) 2017 Anime Tosho (animetosho)
+ *  Copyright (C) 2017 Andrey Prygunkov <hugbug@users.sourceforge.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "nzbget.h"
+
+#include "YEncode.h"
+
+#ifdef __SSSE3__
+#include <immintrin.h>
+#endif
+
+namespace YEncode
+{
+#ifdef __SSSE3__
+
+// combine two 8-bit ints into a 16-bit one (a becomes the low byte, b the high byte)
+#define UINT16_PACK(a, b) ((a) | ((b) << 8))
+
+#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/
+// unaligned 16-byte store of xmm to dest
+#define STOREU_XMM(dest, xmm) \
+  _mm_storeu_si128((__m128i*)(dest), xmm)
+// build one 128-bit value from two 8-byte loads: *a fills the low half, *b the high half
+#define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \
+	_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \
+	(b) \
+))
+
+// BitsSetTable256[b] = number of set bits in byte b; table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
+static const unsigned char BitsSetTable256[256] = 
+{
+#   define B2(n) n,     n+1,     n+1,     n+2
+#   define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2)
+#   define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2)
+    B6(0), B6(1), B6(1), B6(2)
+#undef B2
+#undef B4
+#undef B6
+};
+
+extern uint8_t eqFixLUT[256]; // indexed by an 8-bit '=' mask in do_decode_ssse3; filled by init_decode_sse2()
+extern __m64 eqAddLUT[256]; // 8-byte addends used to unescape bytes following '='; filled by init_decode_sse2()
+
+alignas(32)__m64 unshufLUT[256]; // per 8-bit removal mask: positions of the bytes to keep; filled by init_decode_ssse3()
+alignas(32) static const uint8_t _pshufb_combine_table[272] = { // row n (n = 0..8): keep lanes 0..7-n, then lanes 8..15 moved down by n; 0x80 zeroes a lane
+	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,
+	0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,
+	0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,
+	0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,
+	0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,
+	0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,
+	0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
+	0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
+};
+static const __m128i* pshufb_combine_table = (const __m128i*)_pshufb_combine_table; // same bytes viewed as 16-byte rows
+// SSSE3 yEnc decoder: decodes len bytes from src into dest and returns the number of bytes written; state (may be NULL) carries decoder state across calls
+size_t do_decode_ssse3(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
+	if(len <= sizeof(__m128i)*2) return decode_scalar(src, dest, len, state); // too short for the vector path
+	
+	unsigned char *p = dest; // destination pointer
+	unsigned long i = 0; // input position; NOTE(review): unsigned long is 32-bit on Win64 while len is size_t - confirm inputs stay small
+	unsigned char escFirst = 0; // set when the first byte of the next block follows an '=' and must be unescaped
+	unsigned int nextMask = 0; // removal bits carried into the next block when a \r\n. sequence straddles a block boundary
+	char tState = 0; // fallback state storage when the caller passes state == NULL
+	char* pState = state ? state : &tState;
+	if((uintptr_t)src & ((sizeof(__m128i)-1))) {
+		// source is not 16-byte aligned: decode the unaligned head with the scalar decoder
+		unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(__m128i)-1)) & ~(sizeof(__m128i)-1));
+		
+		i = aSrc - src;
+		p += decode_scalar(src, dest, i, pState);
+	}
+	
+	// handle finicky case of \r\n. straddled across initial boundary
+	if(*pState == 0 && i+1 < len && src[i] == '.')
+		nextMask = 1;
+	else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.'))
+		nextMask = 2;
+
+	escFirst = *pState == 1; // state 1 means the next character must be unescaped
+	
+	if(i + (sizeof(__m128i)+1) < len) {
+		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
+		size_t dLen = len - (sizeof(__m128i)+1);
+		dLen = ((dLen-i) + 0xf) & ~0xf; // number of bytes handled by the vector loop, a multiple of 16
+		unsigned char* dSrc = (unsigned char*)src + dLen + i; // end of the vector-processed region
+		long dI = -dLen; // negative offset from dSrc, counts up to 0; NOTE(review): long is 32-bit on Win64
+		i += dLen;
+		
+		for(; dI; dI += sizeof(__m128i)) {
+			__m128i data = _mm_load_si128((__m128i *)(dSrc + dI));
+			
+			// search for special chars
+			__m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')),
+			cmp = _mm_or_si128(
+				_mm_or_si128(
+					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n
+					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a))  // \n\r
+				),
+				cmpEq
+			);
+
+			unsigned int mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later
+			
+			__m128i oData;
+			if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
+				// first byte needs escaping due to preceding = in last loop iteration
+				oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64)); // last argument is the lowest byte
+			} else {
+				oData = _mm_sub_epi8(data, _mm_set1_epi8(42));
+			}
+			mask &= ~escFirst; // an escaped first byte is data, never a special char
+			mask |= nextMask; // apply dot removal carried over from the previous block
+			
+			if (mask != 0) {
+				// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+				// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+				
+				// firstly, resolve invalid sequences of = to deal with cases like '===='
+				unsigned int maskEq = _mm_movemask_epi8(cmpEq);
+				unsigned int tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
+				maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp; // bit 7 of the low half decides whether byte 8 is itself escaped
+				
+				escFirst = (maskEq >> (sizeof(__m128i)-1)); // a real '=' in the last byte escapes the first byte of the next block
+				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+				maskEq <<= 1;
+				mask &= ~maskEq;
+				
+				// unescape chars following `=`
+				oData = _mm_add_epi8(
+					oData,
+					LOAD_HALVES(
+						eqAddLUT + (maskEq&0xff),
+						eqAddLUT + ((maskEq>>8)&0xff)
+					)
+				);
+
+				// handle \r\n. sequences
+				// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+				// find instances of \r\n
+				__m128i tmpData1, tmpData2;
+				__m128i nextData = _mm_load_si128((__m128i *)(dSrc + dI) + 1); // aligned load of the following 16 bytes; only its first 2 bytes are used
+				tmpData1 = _mm_alignr_epi8(nextData, data, 1); // input shifted down by 1 byte
+				tmpData2 = _mm_alignr_epi8(nextData, data, 2); // input shifted down by 2 bytes
+				__m128i cmp1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d));
+				__m128i cmp2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d));
+				// prepare to merge the two comparisons
+				cmp1 = _mm_srli_si128(cmp1, 1);
+				// find all instances of .
+				tmpData2 = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.'));
+				// merge matches of \r\n with those for .
+				unsigned int killDots = _mm_movemask_epi8(
+					_mm_and_si128(tmpData2, _mm_or_si128(cmp1, cmp2))
+				);
+				mask |= (killDots << 2) & 0xffff; // bit k of killDots marks a '.' at byte k+2
+				nextMask = killDots >> (sizeof(__m128i)-2); // dots that fall into the first 2 bytes of the next block
+
+				// all that's left is to 'compress' the data (skip over masked chars)
+				unsigned char skipped = BitsSetTable256[mask & 0xff]; // number of bytes removed from the low half
+				// lookup compress masks and shuffle
+				// load up two halves
+				__m128i shuf = LOAD_HALVES(unshufLUT + (mask&0xff), unshufLUT + (mask>>8));
+				
+				// offset upper half by 8
+				shuf = _mm_add_epi8(shuf, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+				// shift down upper half into lower
+				// TODO: consider using `mask & 0xff` in table instead of counting bits
+				shuf = _mm_shuffle_epi8(shuf, _mm_load_si128(pshufb_combine_table + skipped));
+				
+				// shuffle data
+				oData = _mm_shuffle_epi8(oData, shuf);
+				STOREU_XMM(p, oData); // always writes 16 bytes; only the kept bytes are counted below
+				
+				// increment output position
+				p += XMM_SIZE - skipped - BitsSetTable256[mask >> 8];
+			} else {
+				STOREU_XMM(p, oData);
+				p += XMM_SIZE;
+				escFirst = 0;
+				nextMask = 0;
+			}
+		}
+		
+		if(escFirst) *pState = 1; // escape next character
+		else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n
+		else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r
+		else *pState = 3; // none of the above is pending; the value is consumed by decode_scalar below
+	}
+	
+	// end alignment
+	if(i < len) {
+		p += decode_scalar(src + i, p, len - i, pState);
+	}
+	
+	return p - dest;
+}
+
+extern size_t (*decode_ssse3)(const unsigned char* src, unsigned char* dest, size_t len, char* state);
+#endif
+
+// Installs do_decode_ssse3 as the SSSE3 decoder and builds unshufLUT;
+// compiles to an empty function when __SSSE3__ is not defined.
+void init_decode_ssse3() {
+#ifdef __SSSE3__
+	decode_ssse3 = do_decode_ssse3;
+
+	// unshufLUT[mask] lists, in ascending order, the positions of the bytes whose
+	// mask bit is clear (the bytes to keep), zero-padded to 8 entries
+	for(int mask=0; mask<256; mask++) {
+		uint8_t keep[8] = {0};
+		int count = 0;
+		for(int lane=0; lane<8; lane++) {
+			if(((mask >> lane) & 1) == 0) {
+				keep[count++] = lane;
+			}
+		}
+		// copy the 8 index bytes into the table entry
+		_mm_storel_epi64((__m128i*)(unshufLUT + mask), _mm_loadl_epi64((__m128i*)keep));
+	}
+#endif
+}
+
+}
diff --git a/lib/yencode/YEncode.h b/lib/yencode/YEncode.h
new file mode 100644
index 00000000..774614a4
--- /dev/null
+++ b/lib/yencode/YEncode.h
@@ -0,0 +1,38 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  Copyright (C) 2017 Anime Tosho (animetosho)
+ *  Copyright (C) 2017 Andrey Prygunkov <hugbug@users.sourceforge.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#ifndef YENCODE_H
+#define YENCODE_H
+
+namespace YEncode
+{
+// NOTE(review): size_t/uint32_t are used below but this header includes nothing itself; it relies on the including file (e.g. nzbget.h first)
+void init();
+extern size_t (*decode)(const unsigned char* inbuf, unsigned char* outbuf, size_t, char* state); // function pointer assigned elsewhere; the unnamed size_t is presumably the input length - confirm
+extern size_t (*decode_simd)(const unsigned char* inbuf, unsigned char* outbuf, size_t, char* state); // same signature as decode; target not visible from this header
+size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, char* state); // non-pointer decoder; do_decode_ssse3 calls it for short input and for the unaligned head and tail
+extern uint32_t (*crc32_simd)(const unsigned char* src, long len); // function pointer assigned elsewhere
+extern uint32_t (*inc_crc32_simd)(uint32_t crc, const unsigned char* src, long len); // NOTE(review): presumably continues a running CRC from crc - confirm
+
+}
+
+#endif
diff --git a/nzbget.vcxproj b/nzbget.vcxproj
index edaf80d6..72d9a6b6 100755
--- a/nzbget.vcxproj
+++ b/nzbget.vcxproj
@@ -79,7 +79,7 @@
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\windows\resources;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\lib\yencode;.\windows\resources;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;PACKAGE="nzbget";VERSION="20.0-testing";_DEBUG;_CONSOLE;DEBUG;_WIN32_WINNT=0x0403;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>false</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -100,7 +100,7 @@
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\windows\resources;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\lib\yencode;.\windows\resources;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;PACKAGE="nzbget";VERSION="20.0-testing";_DEBUG;_CONSOLE;DEBUG;_WIN32_WINNT=0x0403;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>false</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -119,7 +119,7 @@
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
     <ClCompile>
-      <AdditionalIncludeDirectories>.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\windows\resources;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\lib\yencode;.\windows\resources;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;PACKAGE="nzbget";VERSION="20.0-testing";NDEBUG;_CONSOLE;_WIN32_WINNT=0x0403;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <ExceptionHandling>Sync</ExceptionHandling>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
@@ -150,7 +150,7 @@
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
     <ClCompile>
-      <AdditionalIncludeDirectories>.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\windows\resources;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>.\daemon\connect;.\daemon\extension;.\daemon\feed;.\daemon\frontend;.\daemon\main;.\daemon\nserv;.\daemon\nntp;.\daemon\postprocess;.\daemon\queue;.\daemon\remote;.\daemon\util;.\daemon\windows;.\lib\par2;.\lib\yencode;.\windows\resources;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;PACKAGE="nzbget";VERSION="20.0-testing";NDEBUG;_CONSOLE;_WIN32_WINNT=0x0403;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <ExceptionHandling>Sync</ExceptionHandling>
       <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
@@ -253,10 +253,7 @@
     <ClCompile Include="daemon\util\Util.cpp" />
     <ClCompile Include="daemon\util\FileSystem.cpp" />
     <ClCompile Include="daemon\windows\StdAfx.cpp">
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+      <PrecompiledHeader >Create</PrecompiledHeader>
     </ClCompile>
     <ClCompile Include="daemon\windows\WinService.cpp" />
     <ClCompile Include="daemon\windows\WinConsole.cpp" />
@@ -279,6 +276,11 @@
     <ClCompile Include="lib\par2\reedsolomon.cpp" />
     <ClCompile Include="lib\par2\verificationhashtable.cpp" />
     <ClCompile Include="lib\par2\verificationpacket.cpp" />
+    <ClCompile Include="lib\yencode\SimdInit.cpp" />
+    <ClCompile Include="lib\yencode\ScalarDecoder.cpp" />
+    <ClCompile Include="lib\yencode\Sse2Decoder.cpp" />
+    <ClCompile Include="lib\yencode\Ssse3Decoder.cpp" />
+    <ClCompile Include="lib\yencode\PclmulCrc.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="daemon\connect\Connection.h" />
@@ -379,6 +381,7 @@
     <ClInclude Include="lib\par2\reedsolomon.h" />
     <ClInclude Include="lib\par2\verificationhashtable.h" />
     <ClInclude Include="lib\par2\verificationpacket.h" />
+    <ClInclude Include="lib\yencode\YEncode.h" />
     <ClInclude Include="windows\resources\resource.h" />
   </ItemGroup>
   <ItemGroup>