diff --git a/Makefile.am b/Makefile.am
index 77cc7c4f..17f8e38f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -218,6 +218,7 @@ endif
 nzbget_SOURCES += \
 	lib/yencode/YEncode.h \
 	lib/yencode/SimdInit.cpp \
+	lib/yencode/SimdDecoder.cpp \
 	lib/yencode/ScalarDecoder.cpp \
 	lib/yencode/Sse2Decoder.cpp \
 	lib/yencode/Ssse3Decoder.cpp \
diff --git a/Makefile.in b/Makefile.in
index ba6606be..f85fdac2 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -313,13 +313,13 @@ am__nzbget_SOURCES_DIST = daemon/connect/Connection.cpp \
 	lib/par2/verificationhashtable.h \
 	lib/par2/verificationpacket.cpp lib/par2/verificationpacket.h \
 	lib/yencode/YEncode.h lib/yencode/SimdInit.cpp \
-	lib/yencode/ScalarDecoder.cpp lib/yencode/Sse2Decoder.cpp \
-	lib/yencode/Ssse3Decoder.cpp lib/yencode/PclmulCrc.cpp \
-	lib/yencode/NeonDecoder.cpp lib/yencode/AcleCrc.cpp \
-	lib/yencode/SliceCrc.cpp lib/catch/catch.h \
-	tests/suite/TestMain.cpp tests/suite/TestMain.h \
-	tests/suite/TestUtil.cpp tests/suite/TestUtil.h \
-	tests/main/CommandLineParserTest.cpp \
+	lib/yencode/SimdDecoder.cpp lib/yencode/ScalarDecoder.cpp \
+	lib/yencode/Sse2Decoder.cpp lib/yencode/Ssse3Decoder.cpp \
+	lib/yencode/PclmulCrc.cpp lib/yencode/NeonDecoder.cpp \
+	lib/yencode/AcleCrc.cpp lib/yencode/SliceCrc.cpp \
+	lib/catch/catch.h tests/suite/TestMain.cpp \
+	tests/suite/TestMain.h tests/suite/TestUtil.cpp \
+	tests/suite/TestUtil.h tests/main/CommandLineParserTest.cpp \
 	tests/main/OptionsTest.cpp tests/feed/FeedFilterTest.cpp \
 	tests/postprocess/DupeMatcherTest.cpp \
 	tests/postprocess/RarRenamerTest.cpp \
@@ -431,6 +431,7 @@ am_nzbget_OBJECTS = daemon/connect/Connection.$(OBJEXT) \
 	daemon/nserv/NzbGenerator.$(OBJEXT) \
 	daemon/nserv/YEncoder.$(OBJEXT) code_revision.$(OBJEXT) \
 	$(am__objects_1) lib/yencode/SimdInit.$(OBJEXT) \
+	lib/yencode/SimdDecoder.$(OBJEXT) \
 	lib/yencode/ScalarDecoder.$(OBJEXT) \
 	lib/yencode/Sse2Decoder.$(OBJEXT) \
 	lib/yencode/Ssse3Decoder.$(OBJEXT) \
@@ -780,11 +781,11 @@ nzbget_SOURCES = daemon/connect/Connection.cpp \
 	daemon/nserv/NzbGenerator.h daemon/nserv/NzbGenerator.cpp \
 	daemon/nserv/YEncoder.h daemon/nserv/YEncoder.cpp \
 	code_revision.cpp $(am__append_1) lib/yencode/YEncode.h \
-	lib/yencode/SimdInit.cpp lib/yencode/ScalarDecoder.cpp \
-	lib/yencode/Sse2Decoder.cpp lib/yencode/Ssse3Decoder.cpp \
-	lib/yencode/PclmulCrc.cpp lib/yencode/NeonDecoder.cpp \
-	lib/yencode/AcleCrc.cpp lib/yencode/SliceCrc.cpp \
-	$(am__append_2) $(am__append_3)
+	lib/yencode/SimdInit.cpp lib/yencode/SimdDecoder.cpp \
+	lib/yencode/ScalarDecoder.cpp lib/yencode/Sse2Decoder.cpp \
+	lib/yencode/Ssse3Decoder.cpp lib/yencode/PclmulCrc.cpp \
+	lib/yencode/NeonDecoder.cpp lib/yencode/AcleCrc.cpp \
+	lib/yencode/SliceCrc.cpp $(am__append_2) $(am__append_3)
 AM_CPPFLAGS = -I$(srcdir)/daemon/connect -I$(srcdir)/daemon/extension \
 	-I$(srcdir)/daemon/feed -I$(srcdir)/daemon/frontend \
 	-I$(srcdir)/daemon/main -I$(srcdir)/daemon/nntp \
@@ -1345,6 +1346,8 @@ lib/yencode/$(DEPDIR)/$(am__dirstamp):
 	@: > lib/yencode/$(DEPDIR)/$(am__dirstamp)
 lib/yencode/SimdInit.$(OBJEXT): lib/yencode/$(am__dirstamp) \
 	lib/yencode/$(DEPDIR)/$(am__dirstamp)
+lib/yencode/SimdDecoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \
+	lib/yencode/$(DEPDIR)/$(am__dirstamp)
 lib/yencode/ScalarDecoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \
 	lib/yencode/$(DEPDIR)/$(am__dirstamp)
 lib/yencode/Sse2Decoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \
@@ -1610,6 +1613,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/NeonDecoder.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/PclmulCrc.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/ScalarDecoder.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/SimdDecoder.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/SimdInit.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/SliceCrc.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/Sse2Decoder.Po@am__quote@
diff --git a/daemon/main/nzbget.h b/daemon/main/nzbget.h
index b93bbb80..2d96f016 100644
--- a/daemon/main/nzbget.h
+++ b/daemon/main/nzbget.h
@@ -406,7 +406,4 @@ template<class T> typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
 }
 #endif
 
-/* Define to 1 to disable article decoding (for internal test purposes only). */
-//#define SKIP_ARTICLE_DECODING
-
 #endif /* NZBGET_H */
diff --git a/daemon/nntp/Decoder.cpp b/daemon/nntp/Decoder.cpp
index d8a2dd40..e72756f3 100644
--- a/daemon/nntp/Decoder.cpp
+++ b/daemon/nntp/Decoder.cpp
@@ -51,9 +51,6 @@ void Decoder::Clear()
 	m_crcCheck = false;
 	m_lineBuf.Reserve(1024*8);
 	m_lineBuf.SetLength(0);
-	m_extraChar = '\0';
-	m_lastChar1 = '\0';
-	m_lastChar2 = '\0';
 }
 
 /* At the beginning of article the processing goes line by line to find '=ybegin'-marker.
@@ -230,104 +227,37 @@ void Decoder::ProcessYenc(char* buffer, int len)
 	}
 }
 
-// find end of yEnc-data or article data
-char* Decoder::FindStreamEnd(char* buffer, int len)
-{
-	// 0: previous characters are '\r\n' OR there is no previous character
-	if (m_state == 0 && len > 1 &&
-		((buffer[0] == '=' && buffer[1] == 'y') ||
-		 (buffer[0] == '.' && buffer[1] == '\r')))
-	{
-		return buffer;
-	}
-	// 1: previous character is '='
-	if (m_state == 1 && buffer[0] == 'y')
-	{
-		m_extraChar = '=';
-		return buffer;
-	}
-	// 2: previous character is '\r'
-	if (m_state == 2 && len > 2 && buffer[0] == '\n' &&
-		((buffer[1] == '=' && buffer[2] == 'y') ||
-		 (buffer[1] == '.' && buffer[2] == '\r')))
-	{
-		return buffer + 1;
-	}
-
-	// previous characters are '\n.'
-	if (m_lastChar2 == '\n' && m_lastChar1 == '.' && buffer[0] == '\r')
-	{
-		m_extraChar = '.';
-		return buffer;
-	}
-
-	char* last = buffer + len - 1;
-	char* line = buffer;
-	int llen = len;
-	while (char* end = (char*)memchr(line, '\n', llen))
-	{
-		if (end + 2 <= last &&
-			((end[1] == '=' && end[2] == 'y') ||
-			 (end[1] == '.' && end[2] == '\r')))
-		{
-			return end + 1;
-		}
-		llen -= (int)(end - line) + 1;
-		line = end + 1;
-	}
-
-	// save last two characters for future use
-	m_lastChar1 = buffer[len - 1];
-	if (len > 1)
-	{
-		m_lastChar2 = buffer[len - 2];
-	}
-
-	return  nullptr;
-}
-
 int Decoder::DecodeYenc(char* buffer, char* outbuf, int len)
 {
-	int inpLen = len;
-	char* end = FindStreamEnd(buffer, len);
-	if (end)
-	{
-		len = (int)(end - buffer);
-	}
+	const unsigned char* src = (unsigned char*)buffer;
+	unsigned char* dst = (unsigned char*)outbuf;
 
-#ifdef SKIP_ARTICLE_DECODING
-	m_state = m_lastChar2 == '\r' && m_lastChar1 == '\n' ? 0 :
-		m_lastChar1 == '=' ? 1 : m_lastChar1 == '\r' ? 2 : 3;
-#else
-	len = (int)YEncode::decode((const uchar*)buffer, (uchar*)outbuf, len, &m_state);
-#endif
+	int endseq = YEncode::decode(&src, &dst, len, (YEncode::YencDecoderState*)&m_state);
+	int outlen = (int)((char*)dst - outbuf);
 
-	if (end)
+	// endseq:
+	//   0: no end sequence found
+	//   1: \r\n=y sequence found, src points to byte after 'y'
+	//   2: \r\n.\r\n sequence found, src points to byte after last '\n'
+	if (endseq != 0)
 	{
 		// switch back to line mode to process '=yend'- or eof- marker
 		m_lineBuf.SetLength(0);
-		if (m_extraChar)
-		{
-			m_lineBuf.Append(&m_extraChar, 1);
-		}
-		m_lineBuf.Append(end, inpLen - (int)(end - buffer));
+		m_lineBuf.Append(endseq == 1 ? "=y" : ".\r\n");
+		m_lineBuf.Append((const char*)src, len - (int)((const char*)src - buffer));
 		m_body = false;
 	}
 
 	if (m_crcCheck)
 	{
-		m_crc32.Append((uchar*)outbuf, (uint32)len);
+		m_crc32.Append((uchar*)outbuf, (uint32)outlen);
 	}
 
-	return len;
+	return outlen;
 }
 
 Decoder::EStatus Decoder::Check()
 {
-#ifdef SKIP_ARTICLE_DECODING
-	return dsFinished;
-#endif
-
 	switch (m_format)
 	{
 		case efYenc:
diff --git a/daemon/nntp/Decoder.h b/daemon/nntp/Decoder.h
index c0f1939e..ee5be418 100644
--- a/daemon/nntp/Decoder.h
+++ b/daemon/nntp/Decoder.h
@@ -58,11 +58,11 @@ public:
 	bool GetEof() { return m_eof; }
 	const char* GetArticleFilename() { return m_articleFilename; }
 
-private:
+private: 
 	EFormat m_format = efUnknown;
 	bool m_begin;
 	bool m_part;
-	bool m_body;
+	bool m_body; 
 	bool m_end;
 	bool m_crc;
 	uint32 m_expectedCRC;
@@ -76,15 +76,11 @@ private:
 	char m_state;
 	CString m_articleFilename;
 	StringBuilder m_lineBuf;
-	char m_extraChar;
-	char m_lastChar1;
-	char m_lastChar2;
 	Crc32 m_crc32;
 
 	EFormat DetectFormat(const char* buffer, int len);
 	void ProcessYenc(char* buffer, int len);
 	int DecodeYenc(char* buffer, char* outbuf, int len);
-	char* FindStreamEnd(char* buffer, int len);
 	EStatus CheckYenc();
 	int DecodeUx(char* buffer, int len);
 	EStatus CheckUx();
diff --git a/lib/yencode/AcleCrc.cpp b/lib/yencode/AcleCrc.cpp
index 192a2163..d3508514 100644
--- a/lib/yencode/AcleCrc.cpp
+++ b/lib/yencode/AcleCrc.cpp
@@ -87,18 +87,14 @@ uint32_t crc_arm_finish(crc_state *const s)
 {
 	return ~s->crc0[0];
 }
-
-extern void (*crc_init_acle)(crc_state *const s);
-extern void (*crc_incr_acle)(crc_state *const s, const unsigned char *src, long len);
-extern uint32_t (*crc_finish_acle)(crc_state *const s);
 #endif
 
 void init_crc_acle()
 {
 #ifdef __ARM_FEATURE_CRC32
-	crc_init_acle = &crc_arm_init;
-	crc_incr_acle = &crc_arm;
-	crc_finish_acle = &crc_arm_finish;
+	crc_init = &crc_arm_init;
+	crc_incr = &crc_arm;
+	crc_finish = &crc_arm_finish;
 #endif
 }
 
diff --git a/lib/yencode/NeonDecoder.cpp b/lib/yencode/NeonDecoder.cpp
index 14e008f5..a45b7ef0 100644
--- a/lib/yencode/NeonDecoder.cpp
+++ b/lib/yencode/NeonDecoder.cpp
@@ -30,242 +30,20 @@
 
 namespace YEncode
 {
-#ifdef __ARM_NEON
 
-// combine two 8-bit ints into a 16-bit one
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define UINT16_PACK(a, b) ((a) | ((b) << 8))
-#else
-#define UINT16_PACK(a, b) (((a) << 8) | (b))
-#endif
-
-// table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
-static const unsigned char BitsSetTable256[256] = 
+namespace Neon
 {
-#   define B2(n) n,     n+1,     n+1,     n+2
-#   define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2)
-#   define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2)
-    B6(0), B6(1), B6(1), B6(2)
-#undef B2
-#undef B4
-#undef B6
-};
-
-static uint16_t neon_movemask(uint8x16_t in) {
-	uint8x16_t mask = vandq_u8(in, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
-# if defined(__aarch64__) && 0
-	// TODO: is this better?
-	return (vaddv_u8(vget_high_u8(mask)) << 8) | vaddv_u8(vget_low_u8(mask));
-# else
-	uint8x8_t res = vpadd_u8(vget_low_u8(mask), vget_high_u8(mask));
-	res = vpadd_u8(res, res);
-	res = vpadd_u8(res, res);
-	return vget_lane_u16(vreinterpret_u16_u8(res), 0);
-# endif
-}
-
-uint8_t eqFixLUT[256];
-alignas(32) uint8x8_t eqAddLUT[256];
-alignas(32) uint8x8_t unshufLUT[256];
-alignas(32) static const uint8_t pshufb_combine_table[272] = {
-	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
-	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,
-	0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,
-	0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,
-	0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,
-	0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,
-	0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,
-	0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
-	0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
-};
-
-size_t do_decode_neon(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
-	if(len <= sizeof(uint8x16_t)*2) return decode_scalar(src, dest, len, state);
-	
-	unsigned char *p = dest; // destination pointer
-	unsigned long i = 0; // input position
-	unsigned char escFirst = 0; // input character; first char needs escaping
-	unsigned int nextMask = 0;
-	char tState = 0;
-	char* pState = state ? state : &tState;
-	if((uintptr_t)src & ((sizeof(uint8x16_t)-1))) {
-		// find source memory alignment
-		unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(uint8x16_t)-1)) & ~(sizeof(uint8x16_t)-1));
-		
-		i = aSrc - src;
-		p += decode_scalar(src, dest, i, pState);
-	}
-	
-	// handle finicky case of \r\n. straddled across initial boundary
-	if(*pState == 0 && i+1 < len && src[i] == '.')
-		nextMask = 1;
-	else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.'))
-		nextMask = 2;
-
-	escFirst = *pState == 1;
-	
-	if(i + (sizeof(uint8x16_t)+1) < len) {
-		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
-		size_t dLen = len - (sizeof(uint8x16_t)+1);
-		dLen = ((dLen-i) + 0xf) & ~0xf;
-		uint8_t* dSrc = (uint8_t*)src + dLen + i;
-		long dI = -dLen;
-		i += dLen;
-		
-		for(; dI; dI += sizeof(uint8x16_t)) {
-			uint8x16_t data = vld1q_u8(dSrc + dI);
-			
-			// search for special chars
-			uint8x16_t cmpEq = vceqq_u8(data, vdupq_n_u8('=')),
-			cmp = vorrq_u8(
-				vorrq_u8(
-					vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0a0d))), // \r\n
-					vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0d0a)))  // \n\r
-				),
-				cmpEq
-			);
-			uint16_t mask = neon_movemask(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later
-			
-			uint8x16_t oData;
-			if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
-				// first byte needs escaping due to preceeding = in last loop iteration
-				oData = vsubq_u8(data, (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42});
-			} else {
-				oData = vsubq_u8(data, vdupq_n_u8(42));
-			}
-			mask &= ~escFirst;
-			mask |= nextMask;
-			
-			if (mask != 0) {
-				// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
-				// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
-				
-				// firstly, resolve invalid sequences of = to deal with cases like '===='
-				uint16_t maskEq = neon_movemask(cmpEq);
-				uint16_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
-				maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;
-				
-				escFirst = (maskEq >> (sizeof(uint8x16_t)-1));
-				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
-				maskEq <<= 1;
-				mask &= ~maskEq;
-				
-				// unescape chars following `=`
-				oData = vaddq_u8(
-					oData,
-					vcombine_u8(
-						vld1_u8((uint8_t*)(eqAddLUT + (maskEq&0xff))),
-						vld1_u8((uint8_t*)(eqAddLUT + ((maskEq>>8)&0xff)))
-					)
-				);
-				
-				// handle \r\n. sequences
-				// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
-				// find instances of \r\n
-				uint8x16_t tmpData1, tmpData2;
-				uint8x16_t nextData = vld1q_u8(dSrc + dI + sizeof(uint8x16_t));
-				tmpData1 = vextq_u8(data, nextData, 1);
-				tmpData2 = vextq_u8(data, nextData, 2);
-				uint8x16_t cmp1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(data), vdupq_n_u16(0x0a0d)));
-				uint8x16_t cmp2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData1), vdupq_n_u16(0x0a0d)));
-				// prepare to merge the two comparisons
-				cmp1 = vextq_u8(cmp1, vdupq_n_u8(0), 1);
-				// find all instances of .
-				tmpData2 = vceqq_u8(tmpData2, vdupq_n_u8('.'));
-				// merge matches of \r\n with those for .
-				uint16_t killDots = neon_movemask(
-					vandq_u8(tmpData2, vorrq_u8(cmp1, cmp2))
-				);
-				mask |= (killDots << 2) & 0xffff;
-				nextMask = killDots >> (sizeof(uint8x16_t)-2);
-
-				// all that's left is to 'compress' the data (skip over masked chars)
-				unsigned char skipped = BitsSetTable256[mask & 0xff];
-				// lookup compress masks and shuffle
-				oData = vcombine_u8(
-					vtbl1_u8(vget_low_u8(oData),  vld1_u8((uint8_t*)(unshufLUT + (mask&0xff)))),
-					vtbl1_u8(vget_high_u8(oData), vld1_u8((uint8_t*)(unshufLUT + (mask>>8))))
-				);
-				// compact down
-				uint8x16_t compact = vld1q_u8(pshufb_combine_table + skipped*sizeof(uint8x16_t));
-# ifdef __aarch64__
-				oData = vqtbl1q_u8(oData, compact);
-# else
-				uint8x8x2_t dataH = {vget_low_u8(oData), vget_high_u8(oData)};
-				oData = vcombine_u8(vtbl2_u8(dataH, vget_low_u8(compact)),
-				                    vtbl2_u8(dataH, vget_high_u8(compact)));
-# endif
-				vst1q_u8(p, oData);
-				
-				// increment output position
-				p += sizeof(uint8x16_t) - skipped - BitsSetTable256[mask >> 8];
-				
-			} else {
-				vst1q_u8(p, oData);
-				p += sizeof(uint8x16_t);
-				escFirst = 0;
-				nextMask = 0;
-			}
-		}
-		
-		if(escFirst) *pState = 1; // escape next character
-		else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n
-		else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r
-		else *pState = 3;
-	}
-	
-	// end alignment
-	if(i < len) {
-		p += decode_scalar(src + i, p, len - i, pState);
-	}
-	
-	return p - dest;
-}
-
-extern size_t (*decode_neon)(const unsigned char* src, unsigned char* dest, size_t len, char* state);
+#ifdef __ARM_NEON
+#define SIMD_DECODER
+#include "SimdDecoder.cpp"
 #endif
+}
 
 void init_decode_neon() {
 #ifdef __ARM_NEON
-	decode_neon = &do_decode_neon;
-
-	for(int i=0; i<256; i++) {
-		int k = i;
-		uint8_t res[8];
-		int p = 0;
-		
-		// fix LUT
-		k = i;
-		p = 0;
-		for(int j=0; j<8; j++) {
-			k = i >> j;
-			if(k & 1) {
-				p |= 1 << j;
-				j++;
-			}
-		}
-		eqFixLUT[i] = p;
-		
-		// sub LUT
-		k = i;
-		for(int j=0; j<8; j++) {
-			res[j] = (k & 1) ? 192 /* == -64 */ : 0;
-			k >>= 1;
-		}
-		vst1_u8((uint8_t*)(eqAddLUT + i), vld1_u8(res));
-		
-		k = i;
-		p = 0;
-		for(int j=0; j<8; j++) {
-			if(!(k & 1)) {
-				res[p++] = j;
-			}
-			k >>= 1;
-		}
-		for(; p<8; p++)
-			res[p] = 0;
-		vst1_u8((uint8_t*)(unshufLUT + i), vld1_u8(res));
-	}
+	decode = &YEncode::Neon::do_decode_simd<sizeof(uint8x16_t), YEncode::Neon::do_decode_neon>;
+	YEncode::Neon::decoder_init();
+	decode_simd = true;
 #endif
 }
 
diff --git a/lib/yencode/PclmulCrc.cpp b/lib/yencode/PclmulCrc.cpp
index 39d40954..3b2ac32c 100644
--- a/lib/yencode/PclmulCrc.cpp
+++ b/lib/yencode/PclmulCrc.cpp
@@ -37,6 +37,8 @@
 
 #include "nzbget.h"
 
+#include "YEncode.h"
+
 #ifdef __PCLMUL__
 #include <immintrin.h>
 #endif
@@ -45,11 +47,6 @@ namespace YEncode
 {
 #ifdef __PCLMUL__
 
-struct crc_state
-{
-	alignas(16) unsigned crc0[4 * 5];
-};
-
 void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
 	const __m128i xmm_fold4 = _mm_set_epi32(
 			0x00000001, 0x54442bd4,
@@ -459,18 +456,15 @@ uint32_t crc_fold_512to32(crc_state *const s) {
     return ~crc;
     CRC_SAVE(s)
 }
-
-extern void (*crc_init_pclmul)(crc_state *const s);
-extern void (*crc_incr_pclmul)(crc_state *const s, const unsigned char *src, long len);
-extern uint32_t (*crc_finish_pclmul)(crc_state *const s);
 #endif
 
 void init_crc_pclmul()
 {
 #ifdef __PCLMUL__
-	crc_init_pclmul = &crc_fold_init;
-	crc_incr_pclmul = &crc_fold;
-	crc_finish_pclmul = &crc_fold_512to32;
+	crc_init = &crc_fold_init;
+	crc_incr = &crc_fold;
+	crc_finish = &crc_fold_512to32;
+	crc_simd = true;
 #endif
 }
 
diff --git a/lib/yencode/ScalarDecoder.cpp b/lib/yencode/ScalarDecoder.cpp
index e44ec134..9fbb98b1 100644
--- a/lib/yencode/ScalarDecoder.cpp
+++ b/lib/yencode/ScalarDecoder.cpp
@@ -22,8 +22,7 @@
 
 #include "nzbget.h"
 
-namespace YEncode
-{
+#include "YEncode.h"
 
 // combine two 8-bit ints into a 16-bit one
 #if __BYTE_ORDER == __LITTLE_ENDIAN
@@ -32,55 +31,135 @@ namespace YEncode
 #define UINT16_PACK(a, b) (((a) << 8) | (b))
 #endif
 
-// state var: refers to the previous state - only used for incremental processing
-//   0: previous characters are `\r\n` OR there is no previous character
-//   1: previous character is `=`
-//   2: previous character is `\r`
-//   3: previous character is none of the above
-size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
-	unsigned char *es = (unsigned char*)src + len; // end source pointer
-	unsigned char *p = dest; // destination pointer
+namespace YEncode
+{
+
+// return values:
+// - 0: no end sequence found
+// - 1: \r\n=y sequence found, src points to byte after 'y'
+// - 2: \r\n.\r\n sequence found, src points to byte after last '\n'
+int decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+	const unsigned char *es = (*src) + len; // end source pointer
+	unsigned char *p = *dest; // destination pointer
 	long i = -(long)len; // input position
 	unsigned char c; // input character
-
-	if (len < 1) return 0;
-
-	if (state) switch (*state) {
-		case 1:
+	
+	if(len < 1) return 0;
+	
+#define YDEC_CHECK_END(s) if(i == 0) { \
+	*state = s; \
+	*src = es; \
+	*dest = p; \
+	return 0; \
+}
+	if(state) switch(*state) {
+		case YDEC_STATE_CRLFEQ: do_decode_endable_scalar_ceq:
+			if(es[i] == 'y') {
+				*state = YDEC_STATE_NONE;
+				*src = es+i+1;
+				*dest = p;
+				return 1;
+			} // else fall thru and escape
+		case YDEC_STATE_EQ:
 			c = es[i];
 			*p++ = c - 42 - 64;
 			i++;
-			if (c == '\r' && i < 0) {
-				*state = 2;
-				// fall through to case 2
-			}
-			else {
-				*state = 3;
-				break;
-			}
-		case 2:
-			if (es[i] != '\n') break;
+			if(c != '\r') break;
+			YDEC_CHECK_END(YDEC_STATE_CR)
+			// fall through
+		case YDEC_STATE_CR:
+			if(es[i] != '\n') break;
 			i++;
-			*state = 0; // now `\r\n`
-			if (i >= 0) return 0;
-		case 0:
-			// skip past first dot
-			if (es[i] == '.') i++;
-	}
-	else // treat as *state == 0
-		if (es[i] == '.') i++;
-
-	for (; i < -2; i++) {
+			YDEC_CHECK_END(YDEC_STATE_CRLF)
+		case YDEC_STATE_CRLF: do_decode_endable_scalar_c0:
+			if(es[i] == '.') {
+				i++;
+				YDEC_CHECK_END(YDEC_STATE_CRLFDT)
+				// fallthru
+			} else if(es[i] == '=') {
+				i++;
+				YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
+				goto do_decode_endable_scalar_ceq;
+			} else
+				break;
+		case YDEC_STATE_CRLFDT:
+			if(es[i] == '\r') {
+				i++;
+				YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
+				// fallthru
+			} else if(es[i] == '=') { // check for dot-stuffed ending: \r\n.=y
+				i++;
+				YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
+				goto do_decode_endable_scalar_ceq;
+			} else
+				break;
+		case YDEC_STATE_CRLFDTCR:
+			if(es[i] == '\n') {
+				*state = YDEC_STATE_CRLF;
+				*src = es + i + 1;
+				*dest = p;
+				return 2;
+			} else
+				break;
+		case YDEC_STATE_NONE: break; // silence compiler warning
+	} else // treat as YDEC_STATE_CRLF
+		goto do_decode_endable_scalar_c0;
+	
+	for(; i < -2; i++) {
 		c = es[i];
-		switch (c) {
-			case '\r':
-				// skip past \r\n. sequences
-				if (*(uint16_t*)(es + i + 1) == UINT16_PACK('\n', '.'))
-					i += 2;
-			case '\n':
+		switch(c) {
+			case '\r': {
+				uint16_t next = *(uint16_t*)(es + i + 1);
+				if(next == UINT16_PACK('\n', '.')) {
+					// skip past \r\n. sequences
+					i += 3;
+					YDEC_CHECK_END(YDEC_STATE_CRLFDT)
+					// check for end
+					if(es[i] == '\r') {
+						i++;
+						YDEC_CHECK_END(YDEC_STATE_CRLFDTCR)
+						if(es[i] == '\n') {
+							*src = es + i + 1;
+							*dest = p;
+							*state = YDEC_STATE_CRLF;
+							return 2;
+						} else i--;
+					} else if(es[i] == '=') {
+						i++;
+						YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
+						if(es[i] == 'y') {
+							*src = es + i + 1;
+							*dest = p;
+							*state = YDEC_STATE_NONE;
+							return 1;
+						} else {
+							// escape char & continue
+							c = es[i];
+							*p++ = c - 42 - 64;
+							i -= (c == '\r');
+						}
+					} else i--;
+				}
+				else if(next == UINT16_PACK('\n', '=')) {
+					i += 3;
+					YDEC_CHECK_END(YDEC_STATE_CRLFEQ)
+					if(es[i] == 'y') {
+						// ended
+						*src = es + i + 1;
+						*dest = p;
+						*state = YDEC_STATE_NONE;
+						return 1;
+					} else {
+						// escape char & continue
+						c = es[i];
+						*p++ = c - 42 - 64;
+						i -= (c == '\r');
+					}
+				}
+			} case '\n':
 				continue;
 			case '=':
-				c = es[i + 1];
+				c = es[i+1];
 				*p++ = c - 42 - 64;
 				i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case
 				continue;
@@ -88,20 +167,22 @@ size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len,
 				*p++ = c - 42;
 		}
 	}
-	if (state) *state = 3;
-
-	if (i == -2) { // 2nd last char
+	if(state) *state = YDEC_STATE_NONE;
+	
+	if(i == -2) { // 2nd last char
 		c = es[i];
-		switch (c) {
+		switch(c) {
 			case '\r':
-				if (state && es[i + 1] == '\n') {
-					*state = 0;
-					return p - dest;
+				if(state && es[i+1] == '\n') {
+					*state = YDEC_STATE_CRLF;
+					*src = es;
+					*dest = p;
+					return 0;
 				}
 			case '\n':
 				break;
 			case '=':
-				c = es[i + 1];
+				c = es[i+1];
 				*p++ = c - 42 - 64;
 				i += (c != '\r');
 				break;
@@ -110,21 +191,27 @@ size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len,
 		}
 		i++;
 	}
-
+	
 	// do final char; we process this separately to prevent an overflow if the final char is '='
-	if (i == -1) {
+	if(i == -1) {
 		c = es[i];
-		if (c != '\n' && c != '\r' && c != '=') {
+		if(c != '\n' && c != '\r' && c != '=') {
 			*p++ = c - 42;
-		}
-		else if (state) {
-			if (c == '=') *state = 1;
-			else if (c == '\r') *state = 2;
-			else *state = 3;
+		} else if(state) {
+			if(c == '=') *state = YDEC_STATE_EQ;
+			else if(c == '\r') *state = YDEC_STATE_CR;
+			else *state = YDEC_STATE_NONE;
 		}
 	}
+#undef YDEC_CHECK_END
+	
+	*src = es;
+	*dest = p;
+	return 0;
+}
 
-	return p - dest;
+void init_decode_scalar() {
+	decode = decode_scalar;
 }
 
 }
diff --git a/lib/yencode/SimdDecoder.cpp b/lib/yencode/SimdDecoder.cpp
new file mode 100644
index 00000000..98867175
--- /dev/null
+++ b/lib/yencode/SimdDecoder.cpp
@@ -0,0 +1,686 @@
+/*
+ *  Based on node-yencode library by Anime Tosho:
+ *  https://github.com/animetosho/node-yencode
+ *
+ *  Copyright (C) 2017 Anime Tosho (animetosho)
+ *  Copyright (C) 2017 Andrey Prygunkov <hugbug@users.sourceforge.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#ifdef SIMD_DECODER
+
+#ifdef WIN32
+#define FORCE_INLINE __forceinline
+#else
+#define FORCE_INLINE __attribute__((always_inline))
+#endif
+
+// combine two 8-bit ints into a 16-bit one
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define UINT16_PACK(a, b) ((a) | ((b) << 8))
+#define UINT32_PACK(a, b, c, d) ((a) | ((b) << 8) | ((c) << 16) | ((d) << 24))
+#else
+#define UINT16_PACK(a, b) (((a) << 8) | (b))
+#define UINT32_PACK(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d))
+#endif
+
+// table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
+static const unsigned char BitsSetTable256[256] = 
+{
+#   define B2(n) n,     n+1,     n+1,     n+2
+#   define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2)
+#   define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2)
+    B6(0), B6(1), B6(1), B6(2)
+#undef B2
+#undef B4
+#undef B6
+};
+
+template<int width, typename Kernel>
+int do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) {
+	if(len <= width*2) return decode_scalar(src, dest, len, state);
+	
+	YencDecoderState tState = YDEC_STATE_CRLF;
+	YencDecoderState* pState = state ? state : &tState;
+	if((uintptr_t)(*src) & ((width-1))) {
+		// find source memory alignment
+		unsigned char* aSrc = (unsigned char*)(((uintptr_t)(*src) + (width-1)) & ~(width-1));
+		int amount = (int)(aSrc - *src);
+		len -= amount;
+		int ended = decode_scalar(src, dest, amount, pState);
+		if(ended) return ended;
+	}
+	
+	size_t lenBuffer = width -1;
+	lenBuffer += 3 + 1;
+
+	if(len > lenBuffer) {
+		unsigned char *p = *dest; // destination pointer
+		unsigned char escFirst = 0; // input character; first char needs escaping
+		uint16_t nextMask = 0;
+		// handle finicky case of special sequences straddled across initial boundary
+		switch(*pState) {
+			case YDEC_STATE_CRLF:
+				if(**src == '.') {
+					nextMask = 1;
+					if(*(uint16_t*)(*src +1) == UINT16_PACK('\r','\n')) {
+						(*src) += 3;
+						*pState = YDEC_STATE_CRLF;
+						return 2;
+					}
+					if(*(uint16_t*)(*src +1) == UINT16_PACK('=','y')) {
+						(*src) += 3;
+						*pState = YDEC_STATE_NONE;
+						return 1;
+					}
+				}
+				else if(*(uint16_t*)(*src) == UINT16_PACK('=','y')) {
+					(*src) += 2;
+					*pState = YDEC_STATE_NONE;
+					return 1;
+				}
+				break;
+			case YDEC_STATE_CR:
+				if(*(uint16_t*)(*src) == UINT16_PACK('\n','.')) {
+					nextMask = 2;
+					if(*(uint16_t*)(*src +2) == UINT16_PACK('\r','\n')) {
+						(*src) += 4;
+						*pState = YDEC_STATE_CRLF;
+						return 2;
+					}
+					if(*(uint16_t*)(*src +2) == UINT16_PACK('=','y')) {
+						(*src) += 4;
+						*pState = YDEC_STATE_NONE;
+						return 1;
+					}
+				}
+				else if((*(uint32_t*)(*src) & 0xffffff) == UINT32_PACK('\n','=','y',0)) {
+					(*src) += 3;
+					*pState = YDEC_STATE_NONE;
+					return 1;
+				}
+				break;
+			case YDEC_STATE_CRLFDT:
+				if(*(uint16_t*)(*src) == UINT16_PACK('\r','\n')) {
+					(*src) += 2;
+					*pState = YDEC_STATE_CRLF;
+					return 2;
+				}
+				if(*(uint16_t*)(*src) == UINT16_PACK('=','y')) {
+					(*src) += 2;
+					*pState = YDEC_STATE_NONE;
+					return 1;
+				}
+				break;
+			case YDEC_STATE_CRLFDTCR:
+				if(**src == '\n') {
+					(*src) += 1;
+					*pState = YDEC_STATE_CRLF;
+					return 2;
+				}
+				break;
+			case YDEC_STATE_CRLFEQ:
+				if(**src == 'y') {
+					(*src) += 1;
+					*pState = YDEC_STATE_NONE;
+					return 1;
+				}
+				break;
+			default: break; // silence compiler warning
+		}
+		escFirst = (*pState == YDEC_STATE_EQ || *pState == YDEC_STATE_CRLFEQ);
+		
+		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
+		size_t dLen = len - lenBuffer;
+		dLen = (dLen + (width-1)) & ~(width-1);
+		const uint8_t* dSrc = (const uint8_t*)(*src) + dLen;
+
+		Kernel::do_decode(dLen, dSrc, p, escFirst, nextMask);
+
+		if(escFirst) *pState = YDEC_STATE_EQ; // escape next character
+		else if(nextMask == 1) *pState = YDEC_STATE_CRLF; // next character is '.', where previous two were \r\n
+		else if(nextMask == 2) *pState = YDEC_STATE_CR; // next characters are '\n.', previous is \r
+		else *pState = YDEC_STATE_NONE;
+		
+		*src += dLen;
+		len -= dLen;
+		*dest = p;
+	}
+	
+	// end alignment
+	if(len)
+		return decode_scalar(src, dest, len, pState);
+
+	return 0;
+}
+
+alignas(32) static const uint8_t pshufb_combine_table[272] = {
+	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,
+	0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,
+	0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,
+	0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,
+	0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,
+	0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,
+	0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
+	0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
+};
+
+#ifdef __SSE2__
+
+#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/
+
+#if defined(__tune_core2__) || defined(__tune_atom__)
+/* on older Intel CPUs, plus first gen Atom, it is faster to store XMM registers in half */
+# define STOREU_XMM(dest, xmm) \
+  _mm_storel_epi64((__m128i*)(dest), xmm); \
+  _mm_storeh_pi(((__m64*)(dest) +1), _mm_castsi128_ps(xmm))
+#else
+# define STOREU_XMM(dest, xmm) \
+  _mm_storeu_si128((__m128i*)(dest), xmm)
+#endif
+
+#define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \
+	_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \
+	(b) \
+))
+
+uint8_t eqFixLUT[256];
+alignas(32) __m64 eqAddLUT[256];
+#ifdef __SSSE3__
+alignas(32) __m64 unshufLUT[256];
+#endif
+
+template<bool use_ssse3>
+struct do_decode_sse {
+FORCE_INLINE
+static inline void do_decode(size_t& dLen, const uint8_t* dSrc, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
+	long dI = -(long)dLen;
+
+	for(; dI; dI += sizeof(__m128i)) {
+		const uint8_t* src = dSrc + dI;
+
+		__m128i data = _mm_load_si128((__m128i *)src);
+		
+		// search for special chars
+		__m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')),
+#ifdef __AVX512VL__
+		cmp = _mm_ternarylogic_epi32(
+			_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)),
+			_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a)),
+			cmpEq,
+			0xFE
+		);
+#else
+		cmp = _mm_or_si128(
+			_mm_or_si128(
+				_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n
+				_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a))  // \n\r
+			),
+			cmpEq
+		);
+#endif
+		uint16_t mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later
+		
+		__m128i oData;
+		if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
+			// first byte needs escaping due to preceeding = in last loop iteration
+			oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64));
+			mask &= ~1;
+		} else {
+			oData = _mm_sub_epi8(data, _mm_set1_epi8(42));
+		}
+		mask |= nextMask;
+		
+		if (mask != 0) {
+			// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+			// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+			
+			// firstly, resolve invalid sequences of = to deal with cases like '===='
+			uint16_t maskEq = _mm_movemask_epi8(cmpEq);
+			uint16_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
+			maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;
+			
+			unsigned char oldEscFirst = escFirst;
+			escFirst = (maskEq >> (sizeof(__m128i)-1));
+			// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+			maskEq <<= 1;
+			mask &= ~maskEq;
+			
+			// unescape chars following `=`
+#if defined(__AVX512VL__) && defined(__AVX512BW__)
+			// GCC < 7 seems to generate rubbish assembly for this
+			oData = _mm_mask_add_epi8(
+				oData,
+				maskEq,
+				oData,
+				_mm_set1_epi8(-64)
+			);
+#else
+			oData = _mm_add_epi8(
+				oData,
+				LOAD_HALVES(
+					eqAddLUT + (maskEq&0xff),
+					eqAddLUT + ((maskEq>>8)&0xff)
+				)
+			);
+#endif
+			
+			// handle \r\n. sequences
+			// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+			// find instances of \r\n
+			__m128i tmpData1, tmpData2, tmpData3, tmpData4;
+#if defined(__SSSE3__) && !defined(__tune_btver1__)
+			if(use_ssse3) {
+				__m128i nextData = _mm_load_si128((__m128i *)src + 1);
+				tmpData1 = _mm_alignr_epi8(nextData, data, 1);
+				tmpData2 = _mm_alignr_epi8(nextData, data, 2);
+				tmpData3 = _mm_alignr_epi8(nextData, data, 3);
+				tmpData4 = _mm_alignr_epi8(nextData, data, 4);
+			} else {
+#endif
+				tmpData1 = _mm_insert_epi16(_mm_srli_si128(data, 1), *(uint16_t*)(src + sizeof(__m128i)-1), 7);
+				tmpData2 = _mm_insert_epi16(_mm_srli_si128(data, 2), *(uint16_t*)(src + sizeof(__m128i)), 7);
+				tmpData3 = _mm_insert_epi16(_mm_srli_si128(tmpData1, 2), *(uint16_t*)(src + sizeof(__m128i)+1), 7);
+				tmpData4 = _mm_insert_epi16(_mm_srli_si128(tmpData2, 2), *(uint16_t*)(src + sizeof(__m128i)+2), 7);
+#ifdef __SSSE3__
+			}
+#endif
+			__m128i matchNl1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d));
+			__m128i matchNl2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d));
+			
+			__m128i matchDots, matchNlDots;
+			uint16_t killDots;
+			matchDots = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.'));
+			// merge preparation (for non-raw, it doesn't matter if this is shifted or not)
+			matchNl1 = _mm_srli_si128(matchNl1, 1);
+			
+			// merge matches of \r\n with those for .
+#ifdef __AVX512VL__
+			matchNlDots = _mm_ternarylogic_epi32(matchDots, matchNl1, matchNl2, 0xE0);
+#else
+			matchNlDots = _mm_and_si128(matchDots, _mm_or_si128(matchNl1, matchNl2));
+#endif
+			killDots = _mm_movemask_epi8(matchNlDots);
+
+			__m128i cmpB1 = _mm_cmpeq_epi16(tmpData2, _mm_set1_epi16(0x793d)); // "=y"
+			__m128i cmpB2 = _mm_cmpeq_epi16(tmpData3, _mm_set1_epi16(0x793d));
+			if(killDots) {
+				// match instances of \r\n.\r\n and \r\n.=y
+				__m128i cmpC1 = _mm_cmpeq_epi16(tmpData3, _mm_set1_epi16(0x0a0d)); // "\r\n"
+				__m128i cmpC2 = _mm_cmpeq_epi16(tmpData4, _mm_set1_epi16(0x0a0d));
+				cmpC1 = _mm_or_si128(cmpC1, cmpB2);
+				cmpC2 = _mm_or_si128(cmpC2, _mm_cmpeq_epi16(tmpData4, _mm_set1_epi16(0x793d)));
+				cmpC2 = _mm_slli_si128(cmpC2, 1);
+				
+				// prepare cmpB
+				cmpB1 = _mm_and_si128(cmpB1, matchNl1);
+				cmpB2 = _mm_and_si128(cmpB2, matchNl2);
+				
+				// and w/ dots
+#ifdef __AVX512VL__
+				cmpC1 = _mm_ternarylogic_epi32(cmpC1, cmpC2, matchNlDots, 0xA8);
+				cmpB1 = _mm_ternarylogic_epi32(cmpB1, cmpB2, cmpC1, 0xFE);
+#else
+				cmpC1 = _mm_and_si128(_mm_or_si128(cmpC1, cmpC2), matchNlDots);
+				cmpB1 = _mm_or_si128(cmpC1, _mm_or_si128(
+					cmpB1, cmpB2
+				));
+#endif
+			} else {
+#ifdef __AVX512VL__
+				cmpB1 = _mm_ternarylogic_epi32(cmpB1, matchNl1, _mm_and_si128(cmpB2, matchNl2), 0xEA);
+#else
+				cmpB1 = _mm_or_si128(
+					_mm_and_si128(cmpB1, matchNl1),
+					_mm_and_si128(cmpB2, matchNl2)
+				);
+#endif
+			}
+			if(_mm_movemask_epi8(cmpB1)) {
+				// terminator found
+				// there's probably faster ways to do this, but reverting to scalar code should be good enough
+				escFirst = oldEscFirst;
+				dLen += dI;
+				return;
+			}
+			mask |= (killDots << 2) & 0xffff;
+			nextMask = killDots >> (sizeof(__m128i)-2);
+
+			// all that's left is to 'compress' the data (skip over masked chars)
+#ifdef __SSSE3__
+			if(use_ssse3) {
+# if defined(__POPCNT__) && (defined(__tune_znver1__) || defined(__tune_btver2__))
+				unsigned char skipped = _mm_popcnt_u32(mask & 0xff);
+# else
+				unsigned char skipped = BitsSetTable256[mask & 0xff];
+# endif
+				// lookup compress masks and shuffle
+				// load up two halves
+				__m128i shuf = LOAD_HALVES(unshufLUT + (mask&0xff), unshufLUT + (mask>>8));
+				
+				// offset upper half by 8
+				shuf = _mm_add_epi8(shuf, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+				// shift down upper half into lower
+				// TODO: consider using `mask & 0xff` in table instead of counting bits
+				shuf = _mm_shuffle_epi8(shuf, _mm_load_si128((const __m128i*)pshufb_combine_table + skipped));
+				
+				// shuffle data
+				oData = _mm_shuffle_epi8(oData, shuf);
+				STOREU_XMM(p, oData);
+				
+				// increment output position
+# if defined(__POPCNT__) && !defined(__tune_btver1__)
+				p += XMM_SIZE - _mm_popcnt_u32(mask);
+# else
+				p += XMM_SIZE - skipped - BitsSetTable256[mask >> 8];
+# endif
+				
+			} else {
+#endif
+				alignas(32) uint32_t mmTmp[4];
+				_mm_store_si128((__m128i*)mmTmp, oData);
+				
+				for(int j=0; j<4; j++) {
+					if(mask & 0xf) {
+						unsigned char* pMmTmp = (unsigned char*)(mmTmp + j);
+						unsigned int maskn = ~mask;
+						*p = pMmTmp[0];
+						p += (maskn & 1);
+						*p = pMmTmp[1];
+						p += (maskn & 2) >> 1;
+						*p = pMmTmp[2];
+						p += (maskn & 4) >> 2;
+						*p = pMmTmp[3];
+						p += (maskn & 8) >> 3;
+					} else {
+						*(uint32_t*)p = mmTmp[j];
+						p += 4;
+					}
+					mask >>= 4;
+				}
+#ifdef __SSSE3__
+			}
+#endif
+		} else {
+			STOREU_XMM(p, oData);
+			p += XMM_SIZE;
+			escFirst = 0;
+			nextMask = 0;
+		}
+	}
+}
+};
+#endif
+
+
+#ifdef __ARM_NEON
+inline uint16_t neon_movemask(uint8x16_t in) {
+	uint8x16_t mask = vandq_u8(in, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128});
+# if defined(__aarch64__)
+	return (vaddv_u8(vget_high_u8(mask)) << 8) | vaddv_u8(vget_low_u8(mask));
+# else
+	uint8x8_t res = vpadd_u8(vget_low_u8(mask), vget_high_u8(mask));
+	res = vpadd_u8(res, res);
+	res = vpadd_u8(res, res);
+	return vget_lane_u16(vreinterpret_u16_u8(res), 0);
+# endif
+}
+
+uint8_t eqFixLUT[256];
+alignas(32) uint8x8_t eqAddLUT[256];
+alignas(32) uint8x8_t unshufLUT[256];
+
+struct do_decode_neon {
+FORCE_INLINE
+static inline void do_decode(size_t& dLen, const uint8_t* dSrc, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) {
+	long dI = -(long)dLen;
+
+	for(; dI; dI += sizeof(uint8x16_t)) {
+		const uint8_t* src = dSrc + dI;
+
+		uint8x16_t data = vld1q_u8(src);
+		
+		// search for special chars
+		uint8x16_t cmpEq = vceqq_u8(data, vdupq_n_u8('=')),
+		cmp = vorrq_u8(
+			vorrq_u8(
+				vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0a0d))), // \r\n
+				vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0d0a)))  // \n\r
+			),
+			cmpEq
+		);
+		uint16_t mask = neon_movemask(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later
+		
+		uint8x16_t oData;
+		if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
+			// first byte needs escaping due to preceeding = in last loop iteration
+			oData = vsubq_u8(data, (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42});
+			mask &= ~1;
+		} else {
+			oData = vsubq_u8(data, vdupq_n_u8(42));
+		}
+		mask |= nextMask;
+		
+		if (mask != 0) {
+			// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
+			// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
+			
+			// firstly, resolve invalid sequences of = to deal with cases like '===='
+			uint16_t maskEq = neon_movemask(cmpEq);
+			uint16_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
+			maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;
+			
+			unsigned char oldEscFirst = escFirst;
+			escFirst = (maskEq >> (sizeof(uint8x16_t)-1));
+			// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
+			maskEq <<= 1;
+			mask &= ~maskEq;
+			
+			// unescape chars following `=`
+			oData = vaddq_u8(
+				oData,
+				vcombine_u8(
+					vld1_u8((uint8_t*)(eqAddLUT + (maskEq&0xff))),
+					vld1_u8((uint8_t*)(eqAddLUT + ((maskEq>>8)&0xff)))
+				)
+			);
+			
+			// handle \r\n. sequences
+			// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
+			// find instances of \r\n
+			uint8x16_t tmpData1, tmpData2, tmpData3, tmpData4;
+			uint8x16_t nextData = vld1q_u8(src + sizeof(uint8x16_t));
+			tmpData1 = vextq_u8(data, nextData, 1);
+			tmpData2 = vextq_u8(data, nextData, 2);
+			tmpData3 = vextq_u8(data, nextData, 3);
+			tmpData4 = vextq_u8(data, nextData, 4);
+			uint8x16_t matchNl1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(data), vdupq_n_u16(0x0a0d)));
+			uint8x16_t matchNl2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData1), vdupq_n_u16(0x0a0d)));
+			
+			uint8x16_t matchDots, matchNlDots;
+			uint16_t killDots;
+			matchDots = vceqq_u8(tmpData2, vdupq_n_u8('.'));
+			// merge preparation (for non-raw, it doesn't matter if this is shifted or not)
+			matchNl1 = vextq_u8(matchNl1, vdupq_n_u8(0), 1);
+			
+			// merge matches of \r\n with those for .
+			matchNlDots = vandq_u8(matchDots, vorrq_u8(matchNl1, matchNl2));
+			killDots = neon_movemask(matchNlDots);
+
+			uint8x16_t cmpB1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData2), vdupq_n_u16(0x793d))); // "=y"
+			uint8x16_t cmpB2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData3), vdupq_n_u16(0x793d)));
+			if(killDots) {
+				// match instances of \r\n.\r\n and \r\n.=y
+				uint8x16_t cmpC1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData3), vdupq_n_u16(0x0a0d)));
+				uint8x16_t cmpC2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData4), vdupq_n_u16(0x0a0d)));
+				cmpC1 = vorrq_u8(cmpC1, cmpB2);
+				cmpC2 = vorrq_u8(cmpC2, vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData4), vdupq_n_u16(0x793d))));
+				cmpC2 = vextq_u8(vdupq_n_u8(0), cmpC2, 15);
+				cmpC1 = vorrq_u8(cmpC1, cmpC2);
+				
+				// and w/ dots
+				cmpC1 = vandq_u8(cmpC1, matchNlDots);
+				// then merge w/ cmpB
+				cmpB1 = vandq_u8(cmpB1, matchNl1);
+				cmpB2 = vandq_u8(cmpB2, matchNl2);
+				
+				cmpB1 = vorrq_u8(cmpC1, vorrq_u8(
+					cmpB1, cmpB2
+				));
+			} else {
+				cmpB1 = vorrq_u8(
+					vandq_u8(cmpB1, matchNl1),
+					vandq_u8(cmpB2, matchNl2)
+				);
+			}
+#ifdef __aarch64__
+			if(vget_lane_u64(vqmovn_u64(vreinterpretq_u64_u8(cmpB1)), 0))
+#else
+			uint32x4_t tmp1 = vreinterpretq_u32_u8(cmpB1);
+			uint32x2_t tmp2 = vorr_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
+			if(vget_lane_u32(vpmax_u32(tmp2, tmp2), 0))
+#endif
+			{
+				// terminator found
+				// there's probably faster ways to do this, but reverting to scalar code should be good enough
+				escFirst = oldEscFirst;
+				dLen += dI;
+				return;
+			}
+			mask |= (killDots << 2) & 0xffff;
+			nextMask = killDots >> (sizeof(uint8x16_t)-2);
+
+			// all that's left is to 'compress' the data (skip over masked chars)
+			unsigned char skipped = BitsSetTable256[mask & 0xff];
+			// lookup compress masks and shuffle
+			oData = vcombine_u8(
+				vtbl1_u8(vget_low_u8(oData),  vld1_u8((uint8_t*)(unshufLUT + (mask&0xff)))),
+				vtbl1_u8(vget_high_u8(oData), vld1_u8((uint8_t*)(unshufLUT + (mask>>8))))
+			);
+			// compact down
+			uint8x16_t compact = vld1q_u8(pshufb_combine_table + skipped*sizeof(uint8x16_t));
+#ifdef __aarch64__
+			oData = vqtbl1q_u8(oData, compact);
+#else
+			uint8x8x2_t dataH = {vget_low_u8(oData), vget_high_u8(oData)};
+			oData = vcombine_u8(vtbl2_u8(dataH, vget_low_u8(compact)),
+								vtbl2_u8(dataH, vget_high_u8(compact)));
+#endif
+			vst1q_u8(p, oData);
+			
+			// increment output position
+			p += sizeof(uint8x16_t) - skipped - BitsSetTable256[mask >> 8];
+			
+		} else {
+			vst1q_u8(p, oData);
+			p += sizeof(uint8x16_t);
+			escFirst = 0;
+			nextMask = 0;
+		}
+	}
+}
+};
+#endif
+
+void decoder_init() {
+#ifdef __SSE2__
+	for(int i=0; i<256; i++) {
+		int k = i;
+		uint8_t res[8];
+		int p = 0;
+		
+		// fix LUT
+		k = i;
+		p = 0;
+		for(int j=0; j<8; j++) {
+			k = i >> j;
+			if(k & 1) {
+				p |= 1 << j;
+				j++;
+			}
+		}
+		eqFixLUT[i] = p;
+		
+		// sub LUT
+		k = i;
+		for(int j=0; j<8; j++) {
+			res[j] = (k & 1) ? 192 /* == -64 */ : 0;
+			k >>= 1;
+		}
+		_mm_storel_epi64((__m128i*)(eqAddLUT + i), _mm_loadl_epi64((__m128i*)res));
+	}
+#endif
+
+#ifdef __SSSE3__
+	// generate unshuf LUT
+	for(int i=0; i<256; i++) {
+		int k = i;
+		uint8_t res[8];
+		int p = 0;
+		for(int j=0; j<8; j++) {
+			if(!(k & 1)) {
+				res[p++] = j;
+			}
+			k >>= 1;
+		}
+		for(; p<8; p++)
+			res[p] = 0;
+		_mm_storel_epi64((__m128i*)(unshufLUT + i), _mm_loadl_epi64((__m128i*)res));
+	}
+#endif
+
+#ifdef __ARM_NEON
+	for(int i=0; i<256; i++) {
+		int k = i;
+		uint8_t res[8];
+		int p = 0;
+		
+		// fix LUT
+		k = i;
+		p = 0;
+		for(int j=0; j<8; j++) {
+			k = i >> j;
+			if(k & 1) {
+				p |= 1 << j;
+				j++;
+			}
+		}
+		eqFixLUT[i] = p;
+		
+		// sub LUT
+		k = i;
+		for(int j=0; j<8; j++) {
+			res[j] = (k & 1) ? 192 /* == -64 */ : 0;
+			k >>= 1;
+		}
+		vst1_u8((uint8_t*)(eqAddLUT + i), vld1_u8(res));
+		
+		k = i;
+		p = 0;
+		for(int j=0; j<8; j++) {
+			if(!(k & 1)) {
+				res[p++] = j;
+			}
+			k >>= 1;
+		}
+		for(; p<8; p++)
+			res[p] = 0;
+		vst1_u8((uint8_t*)(unshufLUT + i), vld1_u8(res));
+	}
+#endif
+}
+#endif
diff --git a/lib/yencode/SimdInit.cpp b/lib/yencode/SimdInit.cpp
index 3d7c8140..aaae50cb 100644
--- a/lib/yencode/SimdInit.cpp
+++ b/lib/yencode/SimdInit.cpp
@@ -31,28 +31,19 @@
 namespace YEncode
 {
 
-size_t (*decode)(const unsigned char*, unsigned char*, size_t, char* state) = nullptr;
+int (*decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = nullptr;
+extern void init_decode_scalar();
 bool decode_simd = false;
 
 void (*crc_init)(crc_state *const s) = nullptr;
 void (*crc_incr)(crc_state *const s, const unsigned char *src, long len) = nullptr;
 uint32_t (*crc_finish)(crc_state *const s) = nullptr;
+extern void init_crc_slice();
 bool crc_simd = false;
 
-void crc_slice_init(crc_state *const s);
-void crc_slice(crc_state *const s, const unsigned char *src, long len);
-uint32_t crc_slice_finish(crc_state *const s);
-
 #if defined(__i686__) || defined(__amd64__)
-size_t (*decode_sse2)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr;
 extern void init_decode_sse2();
-
-size_t (*decode_ssse3)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr;
 extern void init_decode_ssse3();
-
-void (*crc_init_pclmul)(crc_state *const s) = nullptr;
-void (*crc_incr_pclmul)(crc_state *const s, const unsigned char *src, long len) = nullptr;
-uint32_t (*crc_finish_pclmul)(crc_state *const s) = nullptr;
 extern void init_crc_pclmul();
 
 class CpuId
@@ -75,21 +66,14 @@ public:
 #endif
 
 #if defined(__arm__) || defined(__aarch64__)
-size_t (*decode_neon)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr;
 extern void init_decode_neon();
-
-void (*crc_init_acle)(crc_state *const s) = nullptr;
-void (*crc_incr_acle)(crc_state *const s, const unsigned char *src, long len) = nullptr;
-uint32_t (*crc_finish_acle)(crc_state *const s) = nullptr;
 extern void init_crc_acle();
 #endif
 
 void init()
 {
-	decode = &decode_scalar;
-	crc_init = &crc_slice_init;
-	crc_incr = &crc_slice;
-	crc_finish = &crc_slice_finish;
+	init_decode_scalar();
+	init_crc_slice();
 
 #if defined(__i686__) || defined(__amd64__)
 	CpuId cpuid(1);
@@ -102,31 +86,14 @@ void init()
 	if (cpu_supports_sse2)
 	{
 		init_decode_sse2();
-		if (decode_sse2)
-		{
-			decode = decode_sse2;
-			decode_simd = true;
-		}
 	}
 	if (cpu_supports_ssse3)
 	{
 		init_decode_ssse3();
-		if (decode_ssse3)
-		{
-			decode = decode_ssse3;
-			decode_simd = true;
-		}
 	}
 	if (cpu_supports_sse41 && cpu_supports_pclmul)
 	{
 		init_crc_pclmul();
-		if (crc_init_pclmul && crc_incr_pclmul && crc_finish_pclmul)
-		{
-			crc_init = crc_init_pclmul;
-			crc_incr = crc_incr_pclmul;
-			crc_finish = crc_finish_pclmul;
-			crc_simd = true;
-		}
 	}
 #endif
 
@@ -151,22 +118,10 @@ void init()
 	if (cpu_supports_neon)
 	{
 		init_decode_neon();
-		if (decode_neon)
-		{
-			decode = decode_neon;
-			decode_simd = true;
-		}
 	}
 	if (cpu_supports_crc)
 	{
 		init_crc_acle();
-		if (crc_init_acle && crc_incr_acle && crc_finish_acle)
-		{
-			crc_init = crc_init_acle;
-			crc_incr = crc_incr_acle;
-			crc_finish = crc_finish_acle;
-			crc_simd = true;
-		}
 	}
 #endif
 }
diff --git a/lib/yencode/SliceCrc.cpp b/lib/yencode/SliceCrc.cpp
index fc35426c..a9684d49 100644
--- a/lib/yencode/SliceCrc.cpp
+++ b/lib/yencode/SliceCrc.cpp
@@ -206,4 +206,11 @@ uint32_t crc_slice_finish(crc_state *const s)
 	return ~s->crc0[0];
 }
 
+void init_crc_slice()
+{
+	crc_init = &crc_slice_init;
+	crc_incr = &crc_slice;
+	crc_finish = &crc_slice_finish;
+}
+
 }
diff --git a/lib/yencode/Sse2Decoder.cpp b/lib/yencode/Sse2Decoder.cpp
index a3c47292..0f0971c8 100644
--- a/lib/yencode/Sse2Decoder.cpp
+++ b/lib/yencode/Sse2Decoder.cpp
@@ -30,200 +30,20 @@
 
 namespace YEncode
 {
+
+namespace Sse2
+{
 #ifdef __SSE2__
-
-// combine two 8-bit ints into a 16-bit one
-#define UINT16_PACK(a, b) ((a) | ((b) << 8))
-
-#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/
-
-#define STOREU_XMM(dest, xmm) \
-  _mm_storeu_si128((__m128i*)(dest), xmm)
-
-#define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \
-	_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \
-	(b) \
-))
-
-uint8_t eqFixLUT[256];
-alignas(32) __m64 eqAddLUT[256];
-
-size_t do_decode_sse2(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
-	if(len <= sizeof(__m128i)*2) return decode_scalar(src, dest, len, state);
-	
-	unsigned char *p = dest; // destination pointer
-	unsigned long i = 0; // input position
-	unsigned char escFirst = 0; // input character; first char needs escaping
-	unsigned int nextMask = 0;
-	char tState = 0;
-	char* pState = state ? state : &tState;
-	if((uintptr_t)src & ((sizeof(__m128i)-1))) {
-		// find source memory alignment
-		unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(__m128i)-1)) & ~(sizeof(__m128i)-1));
-		
-		i = (unsigned long)(aSrc - src);
-		p += decode_scalar(src, dest, i, pState);
-	}
-	
-	if(*pState == 0 && i+1 < len && src[i] == '.')
-		nextMask = 1;
-	else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.'))
-		nextMask = 2;
-
-	escFirst = *pState == 1;
-	
-	if(i + (sizeof(__m128i)+1) < len) {
-		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
-		size_t dLen = len - (sizeof(__m128i)+1);
-		dLen = ((dLen-i) + 0xf) & ~0xf;
-		unsigned char* dSrc = (unsigned char*)src + dLen + i;
-		long dI = -(long)dLen;
-		i += dLen;
-		
-		for(; dI; dI += sizeof(__m128i)) {
-			__m128i data = _mm_load_si128((__m128i *)(dSrc + dI));
-			
-			// search for special chars
-			__m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')),
-			cmp = _mm_or_si128(
-				_mm_or_si128(
-					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n
-					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a))  // \n\r
-				),
-				cmpEq
-			);
-
-			unsigned int mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later
-			
-			__m128i oData;
-			if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
-				// first byte needs escaping due to preceeding = in last loop iteration
-				oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64));
-			} else {
-				oData = _mm_sub_epi8(data, _mm_set1_epi8(42));
-			}
-			mask &= ~escFirst;
-			mask |= nextMask;
-			
-			if (mask != 0) {
-				// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
-				// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
-
-				// firstly, resolve invalid sequences of = to deal with cases like '===='
-				unsigned int maskEq = _mm_movemask_epi8(cmpEq);
-				unsigned int tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
-				maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;
-				
-				escFirst = (maskEq >> (sizeof(__m128i)-1));
-				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
-				maskEq <<= 1;
-				mask &= ~maskEq;
-				
-				// unescape chars following `=`
-				oData = _mm_add_epi8(
-					oData,
-					LOAD_HALVES(
-						eqAddLUT + (maskEq&0xff),
-						eqAddLUT + ((maskEq>>8)&0xff)
-					)
-				);
-
-				// handle \r\n. sequences
-				// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
-				// find instances of \r\n
-				__m128i tmpData1, tmpData2;
-				tmpData1 = _mm_insert_epi16(_mm_srli_si128(data, 1), *(uint16_t*)(dSrc + dI + sizeof(__m128i)-1), 7);
-				tmpData2 = _mm_insert_epi16(_mm_srli_si128(data, 2), *(uint16_t*)(dSrc + dI + sizeof(__m128i)), 7);
-				__m128i cmp1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d));
-				__m128i cmp2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d));
-				// prepare to merge the two comparisons
-				cmp1 = _mm_srli_si128(cmp1, 1);
-				// find all instances of .
-				tmpData2 = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.'));
-				// merge matches of \r\n with those for .
-				unsigned int killDots = _mm_movemask_epi8(
-					_mm_and_si128(tmpData2, _mm_or_si128(cmp1, cmp2))
-				);
-				mask |= (killDots << 2) & 0xffff;
-				nextMask = killDots >> (sizeof(__m128i)-2);
-
-				// all that's left is to 'compress' the data (skip over masked chars)
-				alignas(32) uint32_t mmTmp[4];
-				_mm_store_si128((__m128i*)mmTmp, oData);
-				
-				for(int j=0; j<4; j++) {
-					if(mask & 0xf) {
-						unsigned char* pMmTmp = (unsigned char*)(mmTmp + j);
-						unsigned int maskn = ~mask;
-						*p = pMmTmp[0];
-						p += (maskn & 1);
-						*p = pMmTmp[1];
-						p += (maskn & 2) >> 1;
-						*p = pMmTmp[2];
-						p += (maskn & 4) >> 2;
-						*p = pMmTmp[3];
-						p += (maskn & 8) >> 3;
-					} else {
-						*(uint32_t*)p = mmTmp[j];
-						p += 4;
-					}
-					mask >>= 4;
-				}
-			} else {
-				STOREU_XMM(p, oData);
-				p += XMM_SIZE;
-				escFirst = 0;
-				nextMask = 0;
-			}
-		}
-		
-		if(escFirst) *pState = 1; // escape next character
-		else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n
-		else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r
-		else *pState = 3;
-	}
-	
-	// end alignment
-	if(i < len) {
-		p += decode_scalar(src + i, p, len - i, pState);
-	}
-	
-	return p - dest;
-}
-
-extern size_t (*decode_sse2)(const unsigned char* src, unsigned char* dest, size_t len, char* state);
+#define SIMD_DECODER
+#include "SimdDecoder.cpp"
 #endif
+}
 
 void init_decode_sse2() {
 #ifdef __SSE2__
-	decode_sse2 = &do_decode_sse2;
-
-	// generate unshuf LUT
-	for(int i=0; i<256; i++) {
-		int k = i;
-		uint8_t res[8];
-		int p = 0;
-		
-		// fix LUT
-		k = i;
-		p = 0;
-		for(int j=0; j<8; j++) {
-			k = i >> j;
-			if(k & 1) {
-				p |= 1 << j;
-				j++;
-			}
-		}
-		eqFixLUT[i] = p;
-		
-		// sub LUT
-		k = i;
-		for(int j=0; j<8; j++) {
-			res[j] = (k & 1) ? 192 /* == -64 */ : 0;
-			k >>= 1;
-		}
-		_mm_storel_epi64((__m128i*)(eqAddLUT + i), _mm_loadl_epi64((__m128i*)res));
-	}
+	decode = &YEncode::Sse2::do_decode_simd<sizeof(__m128i), YEncode::Sse2::do_decode_sse<false>>;
+	YEncode::Sse2::decoder_init();
+	decode_simd = true;
 #endif
 }
 
diff --git a/lib/yencode/Ssse3Decoder.cpp b/lib/yencode/Ssse3Decoder.cpp
index 15a971ee..aeb7b82b 100644
--- a/lib/yencode/Ssse3Decoder.cpp
+++ b/lib/yencode/Ssse3Decoder.cpp
@@ -30,213 +30,20 @@
 
 namespace YEncode
 {
-#ifdef __SSSE3__
 
-// combine two 8-bit ints into a 16-bit one
-#define UINT16_PACK(a, b) ((a) | ((b) << 8))
-
-#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/
-
-#define STOREU_XMM(dest, xmm) \
-  _mm_storeu_si128((__m128i*)(dest), xmm)
-
-#define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \
-	_mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \
-	(b) \
-))
-
-// table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
-static const unsigned char BitsSetTable256[256] = 
+namespace Ssse3
 {
-#   define B2(n) n,     n+1,     n+1,     n+2
-#   define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2)
-#   define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2)
-    B6(0), B6(1), B6(1), B6(2)
-#undef B2
-#undef B4
-#undef B6
-};
-
-extern uint8_t eqFixLUT[256];
-extern __m64 eqAddLUT[256];
-
-alignas(32)__m64 unshufLUT[256];
-alignas(32) static const uint8_t _pshufb_combine_table[272] = {
-	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
-	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,
-	0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,
-	0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,
-	0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,
-	0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,
-	0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,
-	0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
-	0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
-};
-static const __m128i* pshufb_combine_table = (const __m128i*)_pshufb_combine_table;
-
-size_t do_decode_ssse3(const unsigned char* src, unsigned char* dest, size_t len, char* state) {
-	if(len <= sizeof(__m128i)*2) return decode_scalar(src, dest, len, state);
-	
-	unsigned char *p = dest; // destination pointer
-	unsigned long i = 0; // input position
-	unsigned char escFirst = 0; // input character; first char needs escaping
-	unsigned int nextMask = 0;
-	char tState = 0;
-	char* pState = state ? state : &tState;
-	if((uintptr_t)src & ((sizeof(__m128i)-1))) {
-		// find source memory alignment
-		unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(__m128i)-1)) & ~(sizeof(__m128i)-1));
-		
-		i = (unsigned long)(aSrc - src);
-		p += decode_scalar(src, dest, i, pState);
-	}
-	
-	// handle finicky case of \r\n. straddled across initial boundary
-	if(*pState == 0 && i+1 < len && src[i] == '.')
-		nextMask = 1;
-	else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.'))
-		nextMask = 2;
-
-	escFirst = *pState == 1;
-	
-	if(i + (sizeof(__m128i)+1) < len) {
-		// our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking)
-		size_t dLen = len - (sizeof(__m128i)+1);
-		dLen = ((dLen-i) + 0xf) & ~0xf;
-		unsigned char* dSrc = (unsigned char*)src + dLen + i;
-		long dI = -(long)dLen;
-		i += dLen;
-		
-		for(; dI; dI += sizeof(__m128i)) {
-			__m128i data = _mm_load_si128((__m128i *)(dSrc + dI));
-			
-			// search for special chars
-			__m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')),
-			cmp = _mm_or_si128(
-				_mm_or_si128(
-					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n
-					_mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a))  // \n\r
-				),
-				cmpEq
-			);
-
-			unsigned int mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later
-			
-			__m128i oData;
-			if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers?
-				// first byte needs escaping due to preceeding = in last loop iteration
-				oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64));
-			} else {
-				oData = _mm_sub_epi8(data, _mm_set1_epi8(42));
-			}
-			mask &= ~escFirst;
-			mask |= nextMask;
-			
-			if (mask != 0) {
-				// a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant
-				// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
-				
-				// firstly, resolve invalid sequences of = to deal with cases like '===='
-				unsigned int maskEq = _mm_movemask_epi8(cmpEq);
-				unsigned int tmp = eqFixLUT[(maskEq&0xff) & ~escFirst];
-				maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp;
-				
-				escFirst = (maskEq >> (sizeof(__m128i)-1));
-				// next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed
-				maskEq <<= 1;
-				mask &= ~maskEq;
-				
-				// unescape chars following `=`
-				oData = _mm_add_epi8(
-					oData,
-					LOAD_HALVES(
-						eqAddLUT + (maskEq&0xff),
-						eqAddLUT + ((maskEq>>8)&0xff)
-					)
-				);
-
-				// handle \r\n. sequences
-				// RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing
-				// find instances of \r\n
-				__m128i tmpData1, tmpData2;
-				__m128i nextData = _mm_load_si128((__m128i *)(dSrc + dI) + 1);
-				tmpData1 = _mm_alignr_epi8(nextData, data, 1);
-				tmpData2 = _mm_alignr_epi8(nextData, data, 2);
-				__m128i cmp1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d));
-				__m128i cmp2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d));
-				// prepare to merge the two comparisons
-				cmp1 = _mm_srli_si128(cmp1, 1);
-				// find all instances of .
-				tmpData2 = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.'));
-				// merge matches of \r\n with those for .
-				unsigned int killDots = _mm_movemask_epi8(
-					_mm_and_si128(tmpData2, _mm_or_si128(cmp1, cmp2))
-				);
-				mask |= (killDots << 2) & 0xffff;
-				nextMask = killDots >> (sizeof(__m128i)-2);
-
-				// all that's left is to 'compress' the data (skip over masked chars)
-				unsigned char skipped = BitsSetTable256[mask & 0xff];
-				// lookup compress masks and shuffle
-				// load up two halves
-				__m128i shuf = LOAD_HALVES(unshufLUT + (mask&0xff), unshufLUT + (mask>>8));
-				
-				// offset upper half by 8
-				shuf = _mm_add_epi8(shuf, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
-				// shift down upper half into lower
-				// TODO: consider using `mask & 0xff` in table instead of counting bits
-				shuf = _mm_shuffle_epi8(shuf, _mm_load_si128(pshufb_combine_table + skipped));
-				
-				// shuffle data
-				oData = _mm_shuffle_epi8(oData, shuf);
-				STOREU_XMM(p, oData);
-				
-				// increment output position
-				p += XMM_SIZE - skipped - BitsSetTable256[mask >> 8];
-			} else {
-				STOREU_XMM(p, oData);
-				p += XMM_SIZE;
-				escFirst = 0;
-				nextMask = 0;
-			}
-		}
-		
-		if(escFirst) *pState = 1; // escape next character
-		else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n
-		else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r
-		else *pState = 3;
-	}
-	
-	// end alignment
-	if(i < len) {
-		p += decode_scalar(src + i, p, len - i, pState);
-	}
-	
-	return p - dest;
-}
-
-extern size_t (*decode_ssse3)(const unsigned char* src, unsigned char* dest, size_t len, char* state);
+#ifdef __SSSE3__
+#define SIMD_DECODER
+#include "SimdDecoder.cpp"
 #endif
+}
 
 void init_decode_ssse3() {
 #ifdef __SSSE3__
-	decode_ssse3 = do_decode_ssse3;
-
-	// generate unshuf LUT
-	for(int i=0; i<256; i++) {
-		int k = i;
-		uint8_t res[8];
-		int p = 0;
-		for(int j=0; j<8; j++) {
-			if(!(k & 1)) {
-				res[p++] = j;
-			}
-			k >>= 1;
-		}
-		for(; p<8; p++)
-			res[p] = 0;
-		_mm_storel_epi64((__m128i*)(unshufLUT + i), _mm_loadl_epi64((__m128i*)res));
-	}
+	decode = &YEncode::Ssse3::do_decode_simd<sizeof(__m128i), YEncode::Ssse3::do_decode_sse<true>>;
+	YEncode::Ssse3::decoder_init();
+	decode_simd = true;
 #endif
 }
 
diff --git a/lib/yencode/YEncode.h b/lib/yencode/YEncode.h
index 7b4c9538..757c953d 100644
--- a/lib/yencode/YEncode.h
+++ b/lib/yencode/YEncode.h
@@ -27,8 +27,19 @@ namespace YEncode
 {
 
 void init();
-extern size_t (*decode)(const unsigned char* inbuf, unsigned char* outbuf, size_t, char* state);
-size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, char* state);
+
+typedef enum : char {
+	YDEC_STATE_CRLF, // default
+	YDEC_STATE_EQ,
+	YDEC_STATE_CR,
+	YDEC_STATE_NONE,
+	YDEC_STATE_CRLFDT,
+	YDEC_STATE_CRLFDTCR,
+	YDEC_STATE_CRLFEQ // may actually be "\r\n.=" in raw decoder
+} YencDecoderState;
+
+extern int (*decode)(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state);
+extern int decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state);
 extern bool decode_simd;
 
 struct crc_state
diff --git a/nzbget.vcxproj b/nzbget.vcxproj
index 72d9a6b6..9f8517de 100755
--- a/nzbget.vcxproj
+++ b/nzbget.vcxproj
@@ -280,6 +280,7 @@
     <ClCompile Include="lib\yencode\ScalarDecoder.cpp" />
     <ClCompile Include="lib\yencode\Sse2Decoder.cpp" />
     <ClCompile Include="lib\yencode\Ssse3Decoder.cpp" />
+    <ClCompile Include="lib\yencode\SliceCrc.cpp" />
     <ClCompile Include="lib\yencode\PclmulCrc.cpp" />
   </ItemGroup>
   <ItemGroup>
@@ -382,6 +383,7 @@
     <ClInclude Include="lib\par2\verificationhashtable.h" />
     <ClInclude Include="lib\par2\verificationpacket.h" />
     <ClInclude Include="lib\yencode\YEncode.h" />
+    <ClInclude Include="lib\yencode\SimdDecoder.cpp" />
     <ClInclude Include="windows\resources\resource.h" />
   </ItemGroup>
   <ItemGroup>