diff --git a/Makefile.am b/Makefile.am index 77cc7c4f..17f8e38f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -218,6 +218,7 @@ endif nzbget_SOURCES += \ lib/yencode/YEncode.h \ lib/yencode/SimdInit.cpp \ + lib/yencode/SimdDecoder.cpp \ lib/yencode/ScalarDecoder.cpp \ lib/yencode/Sse2Decoder.cpp \ lib/yencode/Ssse3Decoder.cpp \ diff --git a/Makefile.in b/Makefile.in index ba6606be..f85fdac2 100644 --- a/Makefile.in +++ b/Makefile.in @@ -313,13 +313,13 @@ am__nzbget_SOURCES_DIST = daemon/connect/Connection.cpp \ lib/par2/verificationhashtable.h \ lib/par2/verificationpacket.cpp lib/par2/verificationpacket.h \ lib/yencode/YEncode.h lib/yencode/SimdInit.cpp \ - lib/yencode/ScalarDecoder.cpp lib/yencode/Sse2Decoder.cpp \ - lib/yencode/Ssse3Decoder.cpp lib/yencode/PclmulCrc.cpp \ - lib/yencode/NeonDecoder.cpp lib/yencode/AcleCrc.cpp \ - lib/yencode/SliceCrc.cpp lib/catch/catch.h \ - tests/suite/TestMain.cpp tests/suite/TestMain.h \ - tests/suite/TestUtil.cpp tests/suite/TestUtil.h \ - tests/main/CommandLineParserTest.cpp \ + lib/yencode/SimdDecoder.cpp lib/yencode/ScalarDecoder.cpp \ + lib/yencode/Sse2Decoder.cpp lib/yencode/Ssse3Decoder.cpp \ + lib/yencode/PclmulCrc.cpp lib/yencode/NeonDecoder.cpp \ + lib/yencode/AcleCrc.cpp lib/yencode/SliceCrc.cpp \ + lib/catch/catch.h tests/suite/TestMain.cpp \ + tests/suite/TestMain.h tests/suite/TestUtil.cpp \ + tests/suite/TestUtil.h tests/main/CommandLineParserTest.cpp \ tests/main/OptionsTest.cpp tests/feed/FeedFilterTest.cpp \ tests/postprocess/DupeMatcherTest.cpp \ tests/postprocess/RarRenamerTest.cpp \ @@ -431,6 +431,7 @@ am_nzbget_OBJECTS = daemon/connect/Connection.$(OBJEXT) \ daemon/nserv/NzbGenerator.$(OBJEXT) \ daemon/nserv/YEncoder.$(OBJEXT) code_revision.$(OBJEXT) \ $(am__objects_1) lib/yencode/SimdInit.$(OBJEXT) \ + lib/yencode/SimdDecoder.$(OBJEXT) \ lib/yencode/ScalarDecoder.$(OBJEXT) \ lib/yencode/Sse2Decoder.$(OBJEXT) \ lib/yencode/Ssse3Decoder.$(OBJEXT) \ @@ -780,11 +781,11 @@ nzbget_SOURCES = daemon/connect/Connection.cpp \ daemon/nserv/NzbGenerator.h daemon/nserv/NzbGenerator.cpp \ daemon/nserv/YEncoder.h daemon/nserv/YEncoder.cpp \ code_revision.cpp $(am__append_1) lib/yencode/YEncode.h \ - lib/yencode/SimdInit.cpp lib/yencode/ScalarDecoder.cpp \ - lib/yencode/Sse2Decoder.cpp lib/yencode/Ssse3Decoder.cpp \ - lib/yencode/PclmulCrc.cpp lib/yencode/NeonDecoder.cpp \ - lib/yencode/AcleCrc.cpp lib/yencode/SliceCrc.cpp \ - $(am__append_2) $(am__append_3) + lib/yencode/SimdInit.cpp lib/yencode/SimdDecoder.cpp \ + lib/yencode/ScalarDecoder.cpp lib/yencode/Sse2Decoder.cpp \ + lib/yencode/Ssse3Decoder.cpp lib/yencode/PclmulCrc.cpp \ + lib/yencode/NeonDecoder.cpp lib/yencode/AcleCrc.cpp \ + lib/yencode/SliceCrc.cpp $(am__append_2) $(am__append_3) AM_CPPFLAGS = -I$(srcdir)/daemon/connect -I$(srcdir)/daemon/extension \ -I$(srcdir)/daemon/feed -I$(srcdir)/daemon/frontend \ -I$(srcdir)/daemon/main -I$(srcdir)/daemon/nntp \ @@ -1345,6 +1346,8 @@ lib/yencode/$(DEPDIR)/$(am__dirstamp): @: > lib/yencode/$(DEPDIR)/$(am__dirstamp) lib/yencode/SimdInit.$(OBJEXT): lib/yencode/$(am__dirstamp) \ lib/yencode/$(DEPDIR)/$(am__dirstamp) +lib/yencode/SimdDecoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \ + lib/yencode/$(DEPDIR)/$(am__dirstamp) lib/yencode/ScalarDecoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \ lib/yencode/$(DEPDIR)/$(am__dirstamp) lib/yencode/Sse2Decoder.$(OBJEXT): lib/yencode/$(am__dirstamp) \ @@ -1610,6 +1613,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/NeonDecoder.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/PclmulCrc.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/ScalarDecoder.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/SimdDecoder.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/SimdInit.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/SliceCrc.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@lib/yencode/$(DEPDIR)/Sse2Decoder.Po@am__quote@ diff --git a/daemon/main/nzbget.h b/daemon/main/nzbget.h index b93bbb80..2d96f016 100644 --- a/daemon/main/nzbget.h +++ b/daemon/main/nzbget.h @@ -406,7 +406,4 @@ template typename _Unique_if::_Unknown_bound make_unique(size_t n) { } #endif -/* Define to 1 to disable article decoding (for internal test purposes only). */ -//#define SKIP_ARTICLE_DECODING - #endif /* NZBGET_H */ diff --git a/daemon/nntp/Decoder.cpp b/daemon/nntp/Decoder.cpp index d8a2dd40..e72756f3 100644 --- a/daemon/nntp/Decoder.cpp +++ b/daemon/nntp/Decoder.cpp @@ -51,9 +51,6 @@ void Decoder::Clear() m_crcCheck = false; m_lineBuf.Reserve(1024*8); m_lineBuf.SetLength(0); - m_extraChar = '\0'; - m_lastChar1 = '\0'; - m_lastChar2 = '\0'; } /* At the beginning of article the processing goes line by line to find '=ybegin'-marker. @@ -230,104 +227,37 @@ void Decoder::ProcessYenc(char* buffer, int len) } } -// find end of yEnc-data or article data -char* Decoder::FindStreamEnd(char* buffer, int len) -{ - // 0: previous characters are '\r\n' OR there is no previous character - if (m_state == 0 && len > 1 && - ((buffer[0] == '=' && buffer[1] == 'y') || - (buffer[0] == '.' && buffer[1] == '\r'))) - { - return buffer; - } - // 1: previous character is '=' - if (m_state == 1 && buffer[0] == 'y') - { - m_extraChar = '='; - return buffer; - } - // 2: previous character is '\r' - if (m_state == 2 && len > 2 && buffer[0] == '\n' && - ((buffer[1] == '=' && buffer[2] == 'y') || - (buffer[1] == '.' && buffer[2] == '\r'))) - { - return buffer + 1; - } - - // previous characters are '\n.' - if (m_lastChar2 == '\n' && m_lastChar1 == '.' && buffer[0] == '\r') - { - m_extraChar = '.'; - return buffer; - } - - char* last = buffer + len - 1; - char* line = buffer; - int llen = len; - while (char* end = (char*)memchr(line, '\n', llen)) - { - if (end + 2 <= last && - ((end[1] == '=' && end[2] == 'y') || - (end[1] == '.' && end[2] == '\r'))) - { - return end + 1; - } - llen -= (int)(end - line) + 1; - line = end + 1; - } - - // save last two characters for future use - m_lastChar1 = buffer[len - 1]; - if (len > 1) - { - m_lastChar2 = buffer[len - 2]; - } - - return nullptr; -} - int Decoder::DecodeYenc(char* buffer, char* outbuf, int len) { - int inpLen = len; - char* end = FindStreamEnd(buffer, len); - if (end) - { - len = (int)(end - buffer); - } + const unsigned char* src = (unsigned char*)buffer; + unsigned char* dst = (unsigned char*)outbuf; -#ifdef SKIP_ARTICLE_DECODING - m_state = m_lastChar2 == '\r' && m_lastChar1 == '\n' ? 0 : - m_lastChar1 == '=' ? 1 : m_lastChar1 == '\r' ? 2 : 3; -#else - len = (int)YEncode::decode((const uchar*)buffer, (uchar*)outbuf, len, &m_state); -#endif + int endseq = YEncode::decode(&src, &dst, len, (YEncode::YencDecoderState*)&m_state); + int outlen = (int)((char*)dst - outbuf); - if (end) + // endseq: + // 0: no end sequence found + // 1: \r\n=y sequence found, src points to byte after 'y' + // 2: \r\n.\r\n sequence found, src points to byte after last '\n' + if (endseq != 0) { // switch back to line mode to process '=yend'- or eof- marker m_lineBuf.SetLength(0); - if (m_extraChar) - { - m_lineBuf.Append(&m_extraChar, 1); - } - m_lineBuf.Append(end, inpLen - (int)(end - buffer)); + m_lineBuf.Append(endseq == 1 ? "=y" : ".\r\n"); + m_lineBuf.Append((const char*)src, len - (int)((const char*)src - buffer)); m_body = false; } if (m_crcCheck) { - m_crc32.Append((uchar*)outbuf, (uint32)len); + m_crc32.Append((uchar*)outbuf, (uint32)outlen); } - return len; + return outlen; } Decoder::EStatus Decoder::Check() { -#ifdef SKIP_ARTICLE_DECODING - return dsFinished; -#endif - switch (m_format) { case efYenc: diff --git a/daemon/nntp/Decoder.h b/daemon/nntp/Decoder.h index c0f1939e..ee5be418 100644 --- a/daemon/nntp/Decoder.h +++ b/daemon/nntp/Decoder.h @@ -58,11 +58,11 @@ public: bool GetEof() { return m_eof; } const char* GetArticleFilename() { return m_articleFilename; } -private: +private: EFormat m_format = efUnknown; bool m_begin; bool m_part; - bool m_body; + bool m_body; bool m_end; bool m_crc; uint32 m_expectedCRC; @@ -76,15 +76,11 @@ private: char m_state; CString m_articleFilename; StringBuilder m_lineBuf; - char m_extraChar; - char m_lastChar1; - char m_lastChar2; Crc32 m_crc32; EFormat DetectFormat(const char* buffer, int len); void ProcessYenc(char* buffer, int len); int DecodeYenc(char* buffer, char* outbuf, int len); - char* FindStreamEnd(char* buffer, int len); EStatus CheckYenc(); int DecodeUx(char* buffer, int len); EStatus CheckUx(); diff --git a/lib/yencode/AcleCrc.cpp b/lib/yencode/AcleCrc.cpp index 192a2163..d3508514 100644 --- a/lib/yencode/AcleCrc.cpp +++ b/lib/yencode/AcleCrc.cpp @@ -87,18 +87,14 @@ uint32_t crc_arm_finish(crc_state *const s) { return ~s->crc0[0]; } - -extern void (*crc_init_acle)(crc_state *const s); -extern void (*crc_incr_acle)(crc_state *const s, const unsigned char *src, long len); -extern uint32_t (*crc_finish_acle)(crc_state *const s); #endif void init_crc_acle() { #ifdef __ARM_FEATURE_CRC32 - crc_init_acle = &crc_arm_init; - crc_incr_acle = &crc_arm; - crc_finish_acle = &crc_arm_finish; + crc_init = &crc_arm_init; + crc_incr = &crc_arm; + crc_finish = &crc_arm_finish; #endif } diff --git a/lib/yencode/NeonDecoder.cpp b/lib/yencode/NeonDecoder.cpp index 14e008f5..a45b7ef0 100644 --- a/lib/yencode/NeonDecoder.cpp +++ b/lib/yencode/NeonDecoder.cpp @@ -30,242 +30,20 @@ namespace YEncode { -#ifdef __ARM_NEON -// combine two 8-bit ints into a 16-bit one -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define UINT16_PACK(a, b) ((a) | ((b) << 8)) -#else -#define UINT16_PACK(a, b) (((a) << 8) | (b)) -#endif - -// table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable -static const unsigned char BitsSetTable256[256] = +namespace Neon { -# define B2(n) n, n+1, n+1, n+2 -# define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2) -# define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2) - B6(0), B6(1), B6(1), B6(2) -#undef B2 -#undef B4 -#undef B6 -}; - -static uint16_t neon_movemask(uint8x16_t in) { - uint8x16_t mask = vandq_u8(in, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}); -# if defined(__aarch64__) && 0 - // TODO: is this better? - return (vaddv_u8(vget_high_u8(mask)) << 8) | vaddv_u8(vget_low_u8(mask)); -# else - uint8x8_t res = vpadd_u8(vget_low_u8(mask), vget_high_u8(mask)); - res = vpadd_u8(res, res); - res = vpadd_u8(res, res); - return vget_lane_u16(vreinterpret_u16_u8(res), 0); -# endif -} - -uint8_t eqFixLUT[256]; -alignas(32) uint8x8_t eqAddLUT[256]; -alignas(32) uint8x8_t unshufLUT[256]; -alignas(32) static const uint8_t pshufb_combine_table[272] = { - 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, - 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80, - 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80, - 0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80, - 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80, - 0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80, - 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80, - 0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80, - 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, -}; - -size_t do_decode_neon(const unsigned char* src, unsigned char* dest, size_t len, char* state) { - if(len <= sizeof(uint8x16_t)*2) return decode_scalar(src, dest, len, state); - - unsigned char *p = dest; // destination pointer - unsigned long i = 0; // input position - unsigned char escFirst = 0; // input character; first char needs escaping - unsigned int nextMask = 0; - char tState = 0; - char* pState = state ? state : &tState; - if((uintptr_t)src & ((sizeof(uint8x16_t)-1))) { - // find source memory alignment - unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(uint8x16_t)-1)) & ~(sizeof(uint8x16_t)-1)); - - i = aSrc - src; - p += decode_scalar(src, dest, i, pState); - } - - // handle finicky case of \r\n. straddled across initial boundary - if(*pState == 0 && i+1 < len && src[i] == '.') - nextMask = 1; - else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.')) - nextMask = 2; - - escFirst = *pState == 1; - - if(i + (sizeof(uint8x16_t)+1) < len) { - // our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking) - size_t dLen = len - (sizeof(uint8x16_t)+1); - dLen = ((dLen-i) + 0xf) & ~0xf; - uint8_t* dSrc = (uint8_t*)src + dLen + i; - long dI = -dLen; - i += dLen; - - for(; dI; dI += sizeof(uint8x16_t)) { - uint8x16_t data = vld1q_u8(dSrc + dI); - - // search for special chars - uint8x16_t cmpEq = vceqq_u8(data, vdupq_n_u8('=')), - cmp = vorrq_u8( - vorrq_u8( - vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0a0d))), // \r\n - vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0d0a))) // \n\r - ), - cmpEq - ); - uint16_t mask = neon_movemask(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later - - uint8x16_t oData; - if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers? - // first byte needs escaping due to preceeding = in last loop iteration - oData = vsubq_u8(data, (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42}); - } else { - oData = vsubq_u8(data, vdupq_n_u8(42)); - } - mask &= ~escFirst; - mask |= nextMask; - - if (mask != 0) { - // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant - // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that - - // firstly, resolve invalid sequences of = to deal with cases like '====' - uint16_t maskEq = neon_movemask(cmpEq); - uint16_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst]; - maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp; - - escFirst = (maskEq >> (sizeof(uint8x16_t)-1)); - // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed - maskEq <<= 1; - mask &= ~maskEq; - - // unescape chars following `=` - oData = vaddq_u8( - oData, - vcombine_u8( - vld1_u8((uint8_t*)(eqAddLUT + (maskEq&0xff))), - vld1_u8((uint8_t*)(eqAddLUT + ((maskEq>>8)&0xff))) - ) - ); - - // handle \r\n. sequences - // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing - // find instances of \r\n - uint8x16_t tmpData1, tmpData2; - uint8x16_t nextData = vld1q_u8(dSrc + dI + sizeof(uint8x16_t)); - tmpData1 = vextq_u8(data, nextData, 1); - tmpData2 = vextq_u8(data, nextData, 2); - uint8x16_t cmp1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(data), vdupq_n_u16(0x0a0d))); - uint8x16_t cmp2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData1), vdupq_n_u16(0x0a0d))); - // prepare to merge the two comparisons - cmp1 = vextq_u8(cmp1, vdupq_n_u8(0), 1); - // find all instances of . - tmpData2 = vceqq_u8(tmpData2, vdupq_n_u8('.')); - // merge matches of \r\n with those for . - uint16_t killDots = neon_movemask( - vandq_u8(tmpData2, vorrq_u8(cmp1, cmp2)) - ); - mask |= (killDots << 2) & 0xffff; - nextMask = killDots >> (sizeof(uint8x16_t)-2); - - // all that's left is to 'compress' the data (skip over masked chars) - unsigned char skipped = BitsSetTable256[mask & 0xff]; - // lookup compress masks and shuffle - oData = vcombine_u8( - vtbl1_u8(vget_low_u8(oData), vld1_u8((uint8_t*)(unshufLUT + (mask&0xff)))), - vtbl1_u8(vget_high_u8(oData), vld1_u8((uint8_t*)(unshufLUT + (mask>>8)))) - ); - // compact down - uint8x16_t compact = vld1q_u8(pshufb_combine_table + skipped*sizeof(uint8x16_t)); -# ifdef __aarch64__ - oData = vqtbl1q_u8(oData, compact); -# else - uint8x8x2_t dataH = {vget_low_u8(oData), vget_high_u8(oData)}; - oData = vcombine_u8(vtbl2_u8(dataH, vget_low_u8(compact)), - vtbl2_u8(dataH, vget_high_u8(compact))); -# endif - vst1q_u8(p, oData); - - // increment output position - p += sizeof(uint8x16_t) - skipped - BitsSetTable256[mask >> 8]; - - } else { - vst1q_u8(p, oData); - p += sizeof(uint8x16_t); - escFirst = 0; - nextMask = 0; - } - } - - if(escFirst) *pState = 1; // escape next character - else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n - else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r - else *pState = 3; - } - - // end alignment - if(i < len) { - p += decode_scalar(src + i, p, len - i, pState); - } - - return p - dest; -} - -extern size_t (*decode_neon)(const unsigned char* src, unsigned char* dest, size_t len, char* state); +#ifdef __ARM_NEON +#define SIMD_DECODER +#include "SimdDecoder.cpp" #endif +} void init_decode_neon() { #ifdef __ARM_NEON - decode_neon = &do_decode_neon; - - for(int i=0; i<256; i++) { - int k = i; - uint8_t res[8]; - int p = 0; - - // fix LUT - k = i; - p = 0; - for(int j=0; j<8; j++) { - k = i >> j; - if(k & 1) { - p |= 1 << j; - j++; - } - } - eqFixLUT[i] = p; - - // sub LUT - k = i; - for(int j=0; j<8; j++) { - res[j] = (k & 1) ? 192 /* == -64 */ : 0; - k >>= 1; - } - vst1_u8((uint8_t*)(eqAddLUT + i), vld1_u8(res)); - - k = i; - p = 0; - for(int j=0; j<8; j++) { - if(!(k & 1)) { - res[p++] = j; - } - k >>= 1; - } - for(; p<8; p++) - res[p] = 0; - vst1_u8((uint8_t*)(unshufLUT + i), vld1_u8(res)); - } + decode = &YEncode::Neon::do_decode_simd; + YEncode::Neon::decoder_init(); + decode_simd = true; #endif } diff --git a/lib/yencode/PclmulCrc.cpp b/lib/yencode/PclmulCrc.cpp index 39d40954..3b2ac32c 100644 --- a/lib/yencode/PclmulCrc.cpp +++ b/lib/yencode/PclmulCrc.cpp @@ -37,6 +37,8 @@ #include "nzbget.h" +#include "YEncode.h" + #ifdef __PCLMUL__ #include #endif @@ -45,11 +47,6 @@ namespace YEncode { #ifdef __PCLMUL__ -struct crc_state -{ - alignas(16) unsigned crc0[4 * 5]; -}; - void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4, @@ -459,18 +456,15 @@ uint32_t crc_fold_512to32(crc_state *const s) { return ~crc; CRC_SAVE(s) } - -extern void (*crc_init_pclmul)(crc_state *const s); -extern void (*crc_incr_pclmul)(crc_state *const s, const unsigned char *src, long len); -extern uint32_t (*crc_finish_pclmul)(crc_state *const s); #endif void init_crc_pclmul() { #ifdef __PCLMUL__ - crc_init_pclmul = &crc_fold_init; - crc_incr_pclmul = &crc_fold; - crc_finish_pclmul = &crc_fold_512to32; + crc_init = &crc_fold_init; + crc_incr = &crc_fold; + crc_finish = &crc_fold_512to32; + crc_simd = true; #endif } diff --git a/lib/yencode/ScalarDecoder.cpp b/lib/yencode/ScalarDecoder.cpp index e44ec134..9fbb98b1 100644 --- a/lib/yencode/ScalarDecoder.cpp +++ b/lib/yencode/ScalarDecoder.cpp @@ -22,8 +22,7 @@ #include "nzbget.h" -namespace YEncode -{ +#include "YEncode.h" // combine two 8-bit ints into a 16-bit one #if __BYTE_ORDER == __LITTLE_ENDIAN @@ -32,55 +31,135 @@ namespace YEncode #define UINT16_PACK(a, b) (((a) << 8) | (b)) #endif -// state var: refers to the previous state - only used for incremental processing -// 0: previous characters are `\r\n` OR there is no previous character -// 1: previous character is `=` -// 2: previous character is `\r` -// 3: previous character is none of the above -size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, char* state) { - unsigned char *es = (unsigned char*)src + len; // end source pointer - unsigned char *p = dest; // destination pointer +namespace YEncode +{ + +// return values: +// - 0: no end sequence found +// - 1: \r\n=y sequence found, src points to byte after 'y' +// - 2: \r\n.\r\n sequence found, src points to byte after last '\n' +int decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) { + const unsigned char *es = (*src) + len; // end source pointer + unsigned char *p = *dest; // destination pointer long i = -(long)len; // input position unsigned char c; // input character - - if (len < 1) return 0; - - if (state) switch (*state) { - case 1: + + if(len < 1) return 0; + +#define YDEC_CHECK_END(s) if(i == 0) { \ + *state = s; \ + *src = es; \ + *dest = p; \ + return 0; \ +} + if(state) switch(*state) { + case YDEC_STATE_CRLFEQ: do_decode_endable_scalar_ceq: + if(es[i] == 'y') { + *state = YDEC_STATE_NONE; + *src = es+i+1; + *dest = p; + return 1; + } // else fall thru and escape + case YDEC_STATE_EQ: c = es[i]; *p++ = c - 42 - 64; i++; - if (c == '\r' && i < 0) { - *state = 2; - // fall through to case 2 - } - else { - *state = 3; - break; - } - case 2: - if (es[i] != '\n') break; + if(c != '\r') break; + YDEC_CHECK_END(YDEC_STATE_CR) + // fall through + case YDEC_STATE_CR: + if(es[i] != '\n') break; i++; - *state = 0; // now `\r\n` - if (i >= 0) return 0; - case 0: - // skip past first dot - if (es[i] == '.') i++; - } - else // treat as *state == 0 - if (es[i] == '.') i++; - - for (; i < -2; i++) { + YDEC_CHECK_END(YDEC_STATE_CRLF) + case YDEC_STATE_CRLF: do_decode_endable_scalar_c0: + if(es[i] == '.') { + i++; + YDEC_CHECK_END(YDEC_STATE_CRLFDT) + // fallthru + } else if(es[i] == '=') { + i++; + YDEC_CHECK_END(YDEC_STATE_CRLFEQ) + goto do_decode_endable_scalar_ceq; + } else + break; + case YDEC_STATE_CRLFDT: + if(es[i] == '\r') { + i++; + YDEC_CHECK_END(YDEC_STATE_CRLFDTCR) + // fallthru + } else if(es[i] == '=') { // check for dot-stuffed ending: \r\n.=y + i++; + YDEC_CHECK_END(YDEC_STATE_CRLFEQ) + goto do_decode_endable_scalar_ceq; + } else + break; + case YDEC_STATE_CRLFDTCR: + if(es[i] == '\n') { + *state = YDEC_STATE_CRLF; + *src = es + i + 1; + *dest = p; + return 2; + } else + break; + case YDEC_STATE_NONE: break; // silence compiler warning + } else // treat as YDEC_STATE_CRLF + goto do_decode_endable_scalar_c0; + + for(; i < -2; i++) { c = es[i]; - switch (c) { - case '\r': - // skip past \r\n. sequences - if (*(uint16_t*)(es + i + 1) == UINT16_PACK('\n', '.')) - i += 2; - case '\n': + switch(c) { + case '\r': { + uint16_t next = *(uint16_t*)(es + i + 1); + if(next == UINT16_PACK('\n', '.')) { + // skip past \r\n. sequences + i += 3; + YDEC_CHECK_END(YDEC_STATE_CRLFDT) + // check for end + if(es[i] == '\r') { + i++; + YDEC_CHECK_END(YDEC_STATE_CRLFDTCR) + if(es[i] == '\n') { + *src = es + i + 1; + *dest = p; + *state = YDEC_STATE_CRLF; + return 2; + } else i--; + } else if(es[i] == '=') { + i++; + YDEC_CHECK_END(YDEC_STATE_CRLFEQ) + if(es[i] == 'y') { + *src = es + i + 1; + *dest = p; + *state = YDEC_STATE_NONE; + return 1; + } else { + // escape char & continue + c = es[i]; + *p++ = c - 42 - 64; + i -= (c == '\r'); + } + } else i--; + } + else if(next == UINT16_PACK('\n', '=')) { + i += 3; + YDEC_CHECK_END(YDEC_STATE_CRLFEQ) + if(es[i] == 'y') { + // ended + *src = es + i + 1; + *dest = p; + *state = YDEC_STATE_NONE; + return 1; + } else { + // escape char & continue + c = es[i]; + *p++ = c - 42 - 64; + i -= (c == '\r'); + } + } + } case '\n': continue; case '=': - c = es[i + 1]; + c = es[i+1]; *p++ = c - 42 - 64; i += (c != '\r'); // if we have a \r, reprocess character to deal with \r\n. case continue; @@ -88,20 +167,22 @@ size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, *p++ = c - 42; } } - if (state) *state = 3; - - if (i == -2) { // 2nd last char + if(state) *state = YDEC_STATE_NONE; + + if(i == -2) { // 2nd last char c = es[i]; - switch (c) { + switch(c) { case '\r': - if (state && es[i + 1] == '\n') { - *state = 0; - return p - dest; + if(state && es[i+1] == '\n') { + *state = YDEC_STATE_CRLF; + *src = es; + *dest = p; + return 0; } case '\n': break; case '=': - c = es[i + 1]; + c = es[i+1]; *p++ = c - 42 - 64; i += (c != '\r'); break; @@ -110,21 +191,27 @@ size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, } i++; } - + // do final char; we process this separately to prevent an overflow if the final char is '=' - if (i == -1) { + if(i == -1) { c = es[i]; - if (c != '\n' && c != '\r' && c != '=') { + if(c != '\n' && c != '\r' && c != '=') { *p++ = c - 42; - } - else if (state) { - if (c == '=') *state = 1; - else if (c == '\r') *state = 2; - else *state = 3; + } else if(state) { + if(c == '=') *state = YDEC_STATE_EQ; + else if(c == '\r') *state = YDEC_STATE_CR; + else *state = YDEC_STATE_NONE; } } +#undef YDEC_CHECK_END + + *src = es; + *dest = p; + return 0; +} - return p - dest; +void init_decode_scalar() { + decode = decode_scalar; } } diff --git a/lib/yencode/SimdDecoder.cpp b/lib/yencode/SimdDecoder.cpp new file mode 100644 index 00000000..98867175 --- /dev/null +++ b/lib/yencode/SimdDecoder.cpp @@ -0,0 +1,686 @@ +/* + * Based on node-yencode library by Anime Tosho: + * https://github.com/animetosho/node-yencode + * + * Copyright (C) 2017 Anime Tosho (animetosho) + * Copyright (C) 2017 Andrey Prygunkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +#ifdef SIMD_DECODER + +#ifdef WIN32 +#define FORCE_INLINE __forceinline +#else +#define FORCE_INLINE __attribute__((always_inline)) +#endif + +// combine two 8-bit ints into a 16-bit one +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define UINT16_PACK(a, b) ((a) | ((b) << 8)) +#define UINT32_PACK(a, b, c, d) ((a) | ((b) << 8) | ((c) << 16) | ((d) << 24)) +#else +#define UINT16_PACK(a, b) (((a) << 8) | (b)) +#define UINT32_PACK(a, b, c, d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d)) +#endif + +// table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable +static const unsigned char BitsSetTable256[256] = +{ +# define B2(n) n, n+1, n+1, n+2 +# define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2) +# define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2) + B6(0), B6(1), B6(1), B6(2) +#undef B2 +#undef B4 +#undef B6 +}; + +template +int do_decode_simd(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state) { + if(len <= width*2) return decode_scalar(src, dest, len, state); + + YencDecoderState tState = YDEC_STATE_CRLF; + YencDecoderState* pState = state ? state : &tState; + if((uintptr_t)(*src) & ((width-1))) { + // find source memory alignment + unsigned char* aSrc = (unsigned char*)(((uintptr_t)(*src) + (width-1)) & ~(width-1)); + int amount = (int)(aSrc - *src); + len -= amount; + int ended = decode_scalar(src, dest, amount, pState); + if(ended) return ended; + } + + size_t lenBuffer = width -1; + lenBuffer += 3 + 1; + + if(len > lenBuffer) { + unsigned char *p = *dest; // destination pointer + unsigned char escFirst = 0; // input character; first char needs escaping + uint16_t nextMask = 0; + // handle finicky case of special sequences straddled across initial boundary + switch(*pState) { + case YDEC_STATE_CRLF: + if(**src == '.') { + nextMask = 1; + if(*(uint16_t*)(*src +1) == UINT16_PACK('\r','\n')) { + (*src) += 3; + *pState = YDEC_STATE_CRLF; + return 2; + } + if(*(uint16_t*)(*src +1) == UINT16_PACK('=','y')) { + (*src) += 3; + *pState = YDEC_STATE_NONE; + return 1; + } + } + else if(*(uint16_t*)(*src) == UINT16_PACK('=','y')) { + (*src) += 2; + *pState = YDEC_STATE_NONE; + return 1; + } + break; + case YDEC_STATE_CR: + if(*(uint16_t*)(*src) == UINT16_PACK('\n','.')) { + nextMask = 2; + if(*(uint16_t*)(*src +2) == UINT16_PACK('\r','\n')) { + (*src) += 4; + *pState = YDEC_STATE_CRLF; + return 2; + } + if(*(uint16_t*)(*src +2) == UINT16_PACK('=','y')) { + (*src) += 4; + *pState = YDEC_STATE_NONE; + return 1; + } + } + else if((*(uint32_t*)(*src) & 0xffffff) == UINT32_PACK('\n','=','y',0)) { + (*src) += 3; + *pState = YDEC_STATE_NONE; + return 1; + } + break; + case YDEC_STATE_CRLFDT: + if(*(uint16_t*)(*src) == UINT16_PACK('\r','\n')) { + (*src) += 2; + *pState = YDEC_STATE_CRLF; + return 2; + } + if(*(uint16_t*)(*src) == UINT16_PACK('=','y')) { + (*src) += 2; + *pState = YDEC_STATE_NONE; + return 1; + } + break; + case YDEC_STATE_CRLFDTCR: + if(**src == '\n') { + (*src) += 1; + *pState = YDEC_STATE_CRLF; + return 2; + } + break; + case YDEC_STATE_CRLFEQ: + if(**src == 'y') { + (*src) += 1; + *pState = YDEC_STATE_NONE; + return 1; + } + break; + default: break; // silence compiler warning + } + escFirst = (*pState == YDEC_STATE_EQ || *pState == YDEC_STATE_CRLFEQ); + + // our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking) + size_t dLen = len - lenBuffer; + dLen = (dLen + (width-1)) & ~(width-1); + const uint8_t* dSrc = (const uint8_t*)(*src) + dLen; + + Kernel::do_decode(dLen, dSrc, p, escFirst, nextMask); + + if(escFirst) *pState = YDEC_STATE_EQ; // escape next character + else if(nextMask == 1) *pState = YDEC_STATE_CRLF; // next character is '.', where previous two were \r\n + else if(nextMask == 2) *pState = YDEC_STATE_CR; // next characters are '\n.', previous is \r + else *pState = YDEC_STATE_NONE; + + *src += dLen; + len -= dLen; + *dest = p; + } + + // end alignment + if(len) + return decode_scalar(src, dest, len, pState); + + return 0; +} + +alignas(32) static const uint8_t pshufb_combine_table[272] = { + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80, + 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80, + 0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80, + 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80, + 0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80, + 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80, + 0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80, + 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, +}; + +#ifdef __SSE2__ + +#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/ + +#if defined(__tune_core2__) || defined(__tune_atom__) +/* on older Intel CPUs, plus first gen Atom, it is faster to store XMM registers in half */ +# define STOREU_XMM(dest, xmm) \ + _mm_storel_epi64((__m128i*)(dest), xmm); \ + _mm_storeh_pi(((__m64*)(dest) +1), _mm_castsi128_ps(xmm)) +#else +# define STOREU_XMM(dest, xmm) \ + _mm_storeu_si128((__m128i*)(dest), xmm) +#endif + +#define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \ + _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \ + (b) \ +)) + +uint8_t eqFixLUT[256]; +alignas(32) __m64 eqAddLUT[256]; +#ifdef __SSSE3__ +alignas(32) __m64 unshufLUT[256]; +#endif + +template +struct do_decode_sse { +FORCE_INLINE +static inline void do_decode(size_t& dLen, const uint8_t* dSrc, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) { + long dI = -(long)dLen; + + for(; dI; dI += sizeof(__m128i)) { + const uint8_t* src = dSrc + dI; + + __m128i data = _mm_load_si128((__m128i *)src); + + // search for special chars + __m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')), +#ifdef __AVX512VL__ + cmp = _mm_ternarylogic_epi32( + _mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), + _mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a)), + cmpEq, + 0xFE + ); +#else + cmp = _mm_or_si128( + _mm_or_si128( + _mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n + _mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a)) // \n\r + ), + cmpEq + ); +#endif + uint16_t mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later + + __m128i oData; + if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers? + // first byte needs escaping due to preceeding = in last loop iteration + oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64)); + mask &= ~1; + } else { + oData = _mm_sub_epi8(data, _mm_set1_epi8(42)); + } + mask |= nextMask; + + if (mask != 0) { + // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant + // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that + + // firstly, resolve invalid sequences of = to deal with cases like '====' + uint16_t maskEq = _mm_movemask_epi8(cmpEq); + uint16_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst]; + maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp; + + unsigned char oldEscFirst = escFirst; + escFirst = (maskEq >> (sizeof(__m128i)-1)); + // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed + maskEq <<= 1; + mask &= ~maskEq; + + // unescape chars following `=` +#if defined(__AVX512VL__) && defined(__AVX512BW__) + // GCC < 7 seems to generate rubbish assembly for this + oData = _mm_mask_add_epi8( + oData, + maskEq, + oData, + _mm_set1_epi8(-64) + ); +#else + oData = _mm_add_epi8( + oData, + LOAD_HALVES( + eqAddLUT + (maskEq&0xff), + eqAddLUT + ((maskEq>>8)&0xff) + ) + ); +#endif + + // handle \r\n. sequences + // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing + // find instances of \r\n + __m128i tmpData1, tmpData2, tmpData3, tmpData4; +#if defined(__SSSE3__) && !defined(__tune_btver1__) + if(use_ssse3) { + __m128i nextData = _mm_load_si128((__m128i *)src + 1); + tmpData1 = _mm_alignr_epi8(nextData, data, 1); + tmpData2 = _mm_alignr_epi8(nextData, data, 2); + tmpData3 = _mm_alignr_epi8(nextData, data, 3); + tmpData4 = _mm_alignr_epi8(nextData, data, 4); + } else { +#endif + tmpData1 = _mm_insert_epi16(_mm_srli_si128(data, 1), *(uint16_t*)(src + sizeof(__m128i)-1), 7); + tmpData2 = _mm_insert_epi16(_mm_srli_si128(data, 2), *(uint16_t*)(src + sizeof(__m128i)), 7); + tmpData3 = _mm_insert_epi16(_mm_srli_si128(tmpData1, 2), *(uint16_t*)(src + sizeof(__m128i)+1), 7); + tmpData4 = _mm_insert_epi16(_mm_srli_si128(tmpData2, 2), *(uint16_t*)(src + sizeof(__m128i)+2), 7); +#ifdef __SSSE3__ + } +#endif + __m128i matchNl1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d)); + __m128i matchNl2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d)); + + __m128i matchDots, matchNlDots; + uint16_t killDots; + matchDots = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.')); + // merge preparation (for non-raw, it doesn't matter if this is shifted or not) + matchNl1 = _mm_srli_si128(matchNl1, 1); + + // merge matches of \r\n with those for . +#ifdef __AVX512VL__ + matchNlDots = _mm_ternarylogic_epi32(matchDots, matchNl1, matchNl2, 0xE0); +#else + matchNlDots = _mm_and_si128(matchDots, _mm_or_si128(matchNl1, matchNl2)); +#endif + killDots = _mm_movemask_epi8(matchNlDots); + + __m128i cmpB1 = _mm_cmpeq_epi16(tmpData2, _mm_set1_epi16(0x793d)); // "=y" + __m128i cmpB2 = _mm_cmpeq_epi16(tmpData3, _mm_set1_epi16(0x793d)); + if(killDots) { + // match instances of \r\n.\r\n and \r\n.=y + __m128i cmpC1 = _mm_cmpeq_epi16(tmpData3, _mm_set1_epi16(0x0a0d)); // "\r\n" + __m128i cmpC2 = _mm_cmpeq_epi16(tmpData4, _mm_set1_epi16(0x0a0d)); + cmpC1 = _mm_or_si128(cmpC1, cmpB2); + cmpC2 = _mm_or_si128(cmpC2, _mm_cmpeq_epi16(tmpData4, _mm_set1_epi16(0x793d))); + cmpC2 = _mm_slli_si128(cmpC2, 1); + + // prepare cmpB + cmpB1 = _mm_and_si128(cmpB1, matchNl1); + cmpB2 = _mm_and_si128(cmpB2, matchNl2); + + // and w/ dots +#ifdef __AVX512VL__ + cmpC1 = _mm_ternarylogic_epi32(cmpC1, cmpC2, matchNlDots, 0xA8); + cmpB1 = _mm_ternarylogic_epi32(cmpB1, cmpB2, cmpC1, 0xFE); +#else + cmpC1 = _mm_and_si128(_mm_or_si128(cmpC1, cmpC2), matchNlDots); + cmpB1 = _mm_or_si128(cmpC1, _mm_or_si128( + cmpB1, cmpB2 + )); +#endif + } else { +#ifdef __AVX512VL__ + cmpB1 = _mm_ternarylogic_epi32(cmpB1, matchNl1, _mm_and_si128(cmpB2, matchNl2), 0xEA); +#else + cmpB1 = _mm_or_si128( + _mm_and_si128(cmpB1, matchNl1), + _mm_and_si128(cmpB2, matchNl2) + ); +#endif + } + if(_mm_movemask_epi8(cmpB1)) { + // terminator found + // there's probably faster ways to do this, but reverting to scalar code should be good enough + escFirst = oldEscFirst; + dLen += dI; + return; + } + mask |= (killDots << 2) & 0xffff; + nextMask = killDots >> (sizeof(__m128i)-2); + + // all that's left is to 'compress' the data (skip over masked chars) +#ifdef __SSSE3__ + if(use_ssse3) { +# if defined(__POPCNT__) && (defined(__tune_znver1__) || defined(__tune_btver2__)) + unsigned char skipped = _mm_popcnt_u32(mask & 0xff); +# else + unsigned char skipped = BitsSetTable256[mask & 0xff]; +# endif + // lookup compress masks and shuffle + // load up two halves + __m128i shuf = LOAD_HALVES(unshufLUT + (mask&0xff), unshufLUT + (mask>>8)); + + // offset upper half by 8 + shuf = _mm_add_epi8(shuf, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // shift down upper half into lower + // TODO: consider using `mask & 0xff` in table instead of counting bits + shuf = _mm_shuffle_epi8(shuf, _mm_load_si128((const __m128i*)pshufb_combine_table + skipped)); + + // shuffle data + oData = _mm_shuffle_epi8(oData, shuf); + STOREU_XMM(p, oData); + + // increment output position +# if defined(__POPCNT__) && !defined(__tune_btver1__) + p += XMM_SIZE - _mm_popcnt_u32(mask); +# else + p += XMM_SIZE - skipped - BitsSetTable256[mask >> 8]; +# endif + + } else { +#endif + alignas(32) uint32_t mmTmp[4]; + _mm_store_si128((__m128i*)mmTmp, oData); + + for(int j=0; j<4; j++) { + if(mask & 0xf) { + unsigned char* pMmTmp = (unsigned char*)(mmTmp + j); + unsigned int maskn = ~mask; + *p = pMmTmp[0]; + p += (maskn & 1); + *p = pMmTmp[1]; + p += (maskn & 2) >> 1; + *p = pMmTmp[2]; + p += (maskn & 4) >> 2; + *p = pMmTmp[3]; + p += (maskn & 8) >> 3; + } else { + *(uint32_t*)p = mmTmp[j]; + p += 4; + } + mask >>= 4; + } +#ifdef __SSSE3__ + } +#endif + } else { + STOREU_XMM(p, oData); + p += XMM_SIZE; + escFirst = 0; + nextMask = 0; + } + } +} +}; +#endif + + +#ifdef __ARM_NEON +inline uint16_t neon_movemask(uint8x16_t in) { + uint8x16_t mask = vandq_u8(in, (uint8x16_t){1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128}); +# if defined(__aarch64__) + return (vaddv_u8(vget_high_u8(mask)) << 8) | vaddv_u8(vget_low_u8(mask)); +# else + uint8x8_t res = vpadd_u8(vget_low_u8(mask), vget_high_u8(mask)); + res = vpadd_u8(res, res); + res = vpadd_u8(res, res); + return vget_lane_u16(vreinterpret_u16_u8(res), 0); +# endif +} + +uint8_t eqFixLUT[256]; +alignas(32) uint8x8_t eqAddLUT[256]; +alignas(32) uint8x8_t unshufLUT[256]; + +struct do_decode_neon { +FORCE_INLINE +static inline void do_decode(size_t& dLen, const uint8_t* dSrc, unsigned char*& p, unsigned char& escFirst, uint16_t& nextMask) { + long dI = -(long)dLen; + + for(; dI; dI += sizeof(uint8x16_t)) { + const uint8_t* src = dSrc + dI; + + uint8x16_t data = vld1q_u8(src); + + // search for special chars + uint8x16_t cmpEq = vceqq_u8(data, vdupq_n_u8('=')), + cmp = vorrq_u8( + vorrq_u8( + vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0a0d))), // \r\n + vceqq_u8(data, vreinterpretq_u8_u16(vdupq_n_u16(0x0d0a))) // \n\r + ), + cmpEq + ); + uint16_t mask = neon_movemask(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later + + uint8x16_t oData; + if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers? + // first byte needs escaping due to preceeding = in last loop iteration + oData = vsubq_u8(data, (uint8x16_t){42+64,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42}); + mask &= ~1; + } else { + oData = vsubq_u8(data, vdupq_n_u8(42)); + } + mask |= nextMask; + + if (mask != 0) { + // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant + // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that + + // firstly, resolve invalid sequences of = to deal with cases like '====' + uint16_t maskEq = neon_movemask(cmpEq); + uint16_t tmp = eqFixLUT[(maskEq&0xff) & ~escFirst]; + maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp; + + unsigned char oldEscFirst = escFirst; + escFirst = (maskEq >> (sizeof(uint8x16_t)-1)); + // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed + maskEq <<= 1; + mask &= ~maskEq; + + // unescape chars following `=` + oData = vaddq_u8( + oData, + vcombine_u8( + vld1_u8((uint8_t*)(eqAddLUT + (maskEq&0xff))), + vld1_u8((uint8_t*)(eqAddLUT + ((maskEq>>8)&0xff))) + ) + ); + + // handle \r\n. sequences + // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing + // find instances of \r\n + uint8x16_t tmpData1, tmpData2, tmpData3, tmpData4; + uint8x16_t nextData = vld1q_u8(src + sizeof(uint8x16_t)); + tmpData1 = vextq_u8(data, nextData, 1); + tmpData2 = vextq_u8(data, nextData, 2); + tmpData3 = vextq_u8(data, nextData, 3); + tmpData4 = vextq_u8(data, nextData, 4); + uint8x16_t matchNl1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(data), vdupq_n_u16(0x0a0d))); + uint8x16_t matchNl2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData1), vdupq_n_u16(0x0a0d))); + + uint8x16_t matchDots, matchNlDots; + uint16_t killDots; + matchDots = vceqq_u8(tmpData2, vdupq_n_u8('.')); + // merge preparation (for non-raw, it doesn't matter if this is shifted or not) + matchNl1 = vextq_u8(matchNl1, vdupq_n_u8(0), 1); + + // merge matches of \r\n with those for . + matchNlDots = vandq_u8(matchDots, vorrq_u8(matchNl1, matchNl2)); + killDots = neon_movemask(matchNlDots); + + uint8x16_t cmpB1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData2), vdupq_n_u16(0x793d))); // "=y" + uint8x16_t cmpB2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData3), vdupq_n_u16(0x793d))); + if(killDots) { + // match instances of \r\n.\r\n and \r\n.=y + uint8x16_t cmpC1 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData3), vdupq_n_u16(0x0a0d))); + uint8x16_t cmpC2 = vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData4), vdupq_n_u16(0x0a0d))); + cmpC1 = vorrq_u8(cmpC1, cmpB2); + cmpC2 = vorrq_u8(cmpC2, vreinterpretq_u8_u16(vceqq_u16(vreinterpretq_u16_u8(tmpData4), vdupq_n_u16(0x793d)))); + cmpC2 = vextq_u8(vdupq_n_u8(0), cmpC2, 15); + cmpC1 = vorrq_u8(cmpC1, cmpC2); + + // and w/ dots + cmpC1 = vandq_u8(cmpC1, matchNlDots); + // then merge w/ cmpB + cmpB1 = vandq_u8(cmpB1, matchNl1); + cmpB2 = vandq_u8(cmpB2, matchNl2); + + cmpB1 = vorrq_u8(cmpC1, vorrq_u8( + cmpB1, cmpB2 + )); + } else { + cmpB1 = vorrq_u8( + vandq_u8(cmpB1, matchNl1), + vandq_u8(cmpB2, matchNl2) + ); + } +#ifdef __aarch64__ + if(vget_lane_u64(vqmovn_u64(vreinterpretq_u64_u8(cmpB1)), 0)) +#else + uint32x4_t tmp1 = vreinterpretq_u32_u8(cmpB1); + uint32x2_t tmp2 = vorr_u32(vget_low_u32(tmp1), vget_high_u32(tmp1)); + if(vget_lane_u32(vpmax_u32(tmp2, tmp2), 0)) +#endif + { + // terminator found + // there's probably faster ways to do this, but reverting to scalar code should be good enough + escFirst = oldEscFirst; + dLen += dI; + return; + } + mask |= (killDots << 2) & 0xffff; + nextMask = killDots >> (sizeof(uint8x16_t)-2); + + // all that's left is to 'compress' the data (skip over masked chars) + unsigned char skipped = BitsSetTable256[mask & 0xff]; + // lookup compress masks and shuffle + oData = vcombine_u8( + vtbl1_u8(vget_low_u8(oData), vld1_u8((uint8_t*)(unshufLUT + (mask&0xff)))), + vtbl1_u8(vget_high_u8(oData), vld1_u8((uint8_t*)(unshufLUT + (mask>>8)))) + ); + // compact down + uint8x16_t compact = vld1q_u8(pshufb_combine_table + skipped*sizeof(uint8x16_t)); +#ifdef __aarch64__ + oData = vqtbl1q_u8(oData, compact); +#else + uint8x8x2_t dataH = {vget_low_u8(oData), vget_high_u8(oData)}; + oData = vcombine_u8(vtbl2_u8(dataH, vget_low_u8(compact)), + vtbl2_u8(dataH, vget_high_u8(compact))); +#endif + vst1q_u8(p, oData); + + // increment output position + p += sizeof(uint8x16_t) - skipped - BitsSetTable256[mask >> 8]; + + } else { + vst1q_u8(p, oData); + p += sizeof(uint8x16_t); + escFirst = 0; + nextMask = 0; + } + } +} +}; +#endif + +void decoder_init() { +#ifdef __SSE2__ + for(int i=0; i<256; i++) { + int k = i; + uint8_t res[8]; + int p = 0; + + // fix LUT + k = i; + p = 0; + for(int j=0; j<8; j++) { + k = i >> j; + if(k & 1) { + p |= 1 << j; + j++; + } + } + eqFixLUT[i] = p; + + // sub LUT + k = i; + for(int j=0; j<8; j++) { + res[j] = (k & 1) ? 192 /* == -64 */ : 0; + k >>= 1; + } + _mm_storel_epi64((__m128i*)(eqAddLUT + i), _mm_loadl_epi64((__m128i*)res)); + } +#endif + +#ifdef __SSSE3__ + // generate unshuf LUT + for(int i=0; i<256; i++) { + int k = i; + uint8_t res[8]; + int p = 0; + for(int j=0; j<8; j++) { + if(!(k & 1)) { + res[p++] = j; + } + k >>= 1; + } + for(; p<8; p++) + res[p] = 0; + _mm_storel_epi64((__m128i*)(unshufLUT + i), _mm_loadl_epi64((__m128i*)res)); + } +#endif + +#ifdef __ARM_NEON + for(int i=0; i<256; i++) { + int k = i; + uint8_t res[8]; + int p = 0; + + // fix LUT + k = i; + p = 0; + for(int j=0; j<8; j++) { + k = i >> j; + if(k & 1) { + p |= 1 << j; + j++; + } + } + eqFixLUT[i] = p; + + // sub LUT + k = i; + for(int j=0; j<8; j++) { + res[j] = (k & 1) ? 192 /* == -64 */ : 0; + k >>= 1; + } + vst1_u8((uint8_t*)(eqAddLUT + i), vld1_u8(res)); + + k = i; + p = 0; + for(int j=0; j<8; j++) { + if(!(k & 1)) { + res[p++] = j; + } + k >>= 1; + } + for(; p<8; p++) + res[p] = 0; + vst1_u8((uint8_t*)(unshufLUT + i), vld1_u8(res)); + } +#endif +} +#endif diff --git a/lib/yencode/SimdInit.cpp b/lib/yencode/SimdInit.cpp index 3d7c8140..aaae50cb 100644 --- a/lib/yencode/SimdInit.cpp +++ b/lib/yencode/SimdInit.cpp @@ -31,28 +31,19 @@ namespace YEncode { -size_t (*decode)(const unsigned char*, unsigned char*, size_t, char* state) = nullptr; +int (*decode)(const unsigned char**, unsigned char**, size_t, YencDecoderState*) = nullptr; +extern void init_decode_scalar(); bool decode_simd = false; void (*crc_init)(crc_state *const s) = nullptr; void (*crc_incr)(crc_state *const s, const unsigned char *src, long len) = nullptr; uint32_t (*crc_finish)(crc_state *const s) = nullptr; +extern void init_crc_slice(); bool crc_simd = false; -void crc_slice_init(crc_state *const s); -void crc_slice(crc_state *const s, const unsigned char *src, long len); -uint32_t crc_slice_finish(crc_state *const s); - #if defined(__i686__) || defined(__amd64__) -size_t (*decode_sse2)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr; extern void init_decode_sse2(); - -size_t (*decode_ssse3)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr; extern void init_decode_ssse3(); - -void (*crc_init_pclmul)(crc_state *const s) = nullptr; -void (*crc_incr_pclmul)(crc_state *const s, const unsigned char *src, long len) = nullptr; -uint32_t (*crc_finish_pclmul)(crc_state *const s) = nullptr; extern void init_crc_pclmul(); class CpuId @@ -75,21 +66,14 @@ public: #endif #if defined(__arm__) || defined(__aarch64__) -size_t (*decode_neon)(const unsigned char* src, unsigned char* dest, size_t len, char* state) = nullptr; extern void init_decode_neon(); - -void (*crc_init_acle)(crc_state *const s) = nullptr; -void (*crc_incr_acle)(crc_state *const s, const unsigned char *src, long len) = nullptr; -uint32_t (*crc_finish_acle)(crc_state *const s) = nullptr; extern void init_crc_acle(); #endif void init() { - decode = &decode_scalar; - crc_init = &crc_slice_init; - crc_incr = &crc_slice; - crc_finish = &crc_slice_finish; + init_decode_scalar(); + init_crc_slice(); #if defined(__i686__) || defined(__amd64__) CpuId cpuid(1); @@ -102,31 +86,14 @@ void init() if (cpu_supports_sse2) { init_decode_sse2(); - if (decode_sse2) - { - decode = decode_sse2; - decode_simd = true; - } } if (cpu_supports_ssse3) { init_decode_ssse3(); - if (decode_ssse3) - { - decode = decode_ssse3; - decode_simd = true; - } } if (cpu_supports_sse41 && cpu_supports_pclmul) { init_crc_pclmul(); - if (crc_init_pclmul && crc_incr_pclmul && crc_finish_pclmul) - { - crc_init = crc_init_pclmul; - crc_incr = crc_incr_pclmul; - crc_finish = crc_finish_pclmul; - crc_simd = true; - } } #endif @@ -151,22 +118,10 @@ void init() if (cpu_supports_neon) { init_decode_neon(); - if (decode_neon) - { - decode = decode_neon; - decode_simd = true; - } } if (cpu_supports_crc) { init_crc_acle(); - if (crc_init_acle && crc_incr_acle && crc_finish_acle) - { - crc_init = crc_init_acle; - crc_incr = crc_incr_acle; - crc_finish = crc_finish_acle; - crc_simd = true; - } } #endif } diff --git a/lib/yencode/SliceCrc.cpp b/lib/yencode/SliceCrc.cpp index fc35426c..a9684d49 100644 --- a/lib/yencode/SliceCrc.cpp +++ b/lib/yencode/SliceCrc.cpp @@ -206,4 +206,11 @@ uint32_t crc_slice_finish(crc_state *const s) return ~s->crc0[0]; } +void init_crc_slice() +{ + crc_init = &crc_slice_init; + crc_incr = &crc_slice; + crc_finish = &crc_slice_finish; +} + } diff --git a/lib/yencode/Sse2Decoder.cpp b/lib/yencode/Sse2Decoder.cpp index a3c47292..0f0971c8 100644 --- a/lib/yencode/Sse2Decoder.cpp +++ b/lib/yencode/Sse2Decoder.cpp @@ -30,200 +30,20 @@ namespace YEncode { + +namespace Sse2 +{ #ifdef __SSE2__ - -// combine two 8-bit ints into a 16-bit one -#define UINT16_PACK(a, b) ((a) | ((b) << 8)) - -#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/ - -#define STOREU_XMM(dest, xmm) \ - _mm_storeu_si128((__m128i*)(dest), xmm) - -#define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \ - _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \ - (b) \ -)) - -uint8_t eqFixLUT[256]; -alignas(32) __m64 eqAddLUT[256]; - -size_t do_decode_sse2(const unsigned char* src, unsigned char* dest, size_t len, char* state) { - if(len <= sizeof(__m128i)*2) return decode_scalar(src, dest, len, state); - - unsigned char *p = dest; // destination pointer - unsigned long i = 0; // input position - unsigned char escFirst = 0; // input character; first char needs escaping - unsigned int nextMask = 0; - char tState = 0; - char* pState = state ? state : &tState; - if((uintptr_t)src & ((sizeof(__m128i)-1))) { - // find source memory alignment - unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(__m128i)-1)) & ~(sizeof(__m128i)-1)); - - i = (unsigned long)(aSrc - src); - p += decode_scalar(src, dest, i, pState); - } - - if(*pState == 0 && i+1 < len && src[i] == '.') - nextMask = 1; - else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.')) - nextMask = 2; - - escFirst = *pState == 1; - - if(i + (sizeof(__m128i)+1) < len) { - // our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking) - size_t dLen = len - (sizeof(__m128i)+1); - dLen = ((dLen-i) + 0xf) & ~0xf; - unsigned char* dSrc = (unsigned char*)src + dLen + i; - long dI = -(long)dLen; - i += dLen; - - for(; dI; dI += sizeof(__m128i)) { - __m128i data = _mm_load_si128((__m128i *)(dSrc + dI)); - - // search for special chars - __m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')), - cmp = _mm_or_si128( - _mm_or_si128( - _mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n - _mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a)) // \n\r - ), - cmpEq - ); - - unsigned int mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later - - __m128i oData; - if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers? - // first byte needs escaping due to preceeding = in last loop iteration - oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64)); - } else { - oData = _mm_sub_epi8(data, _mm_set1_epi8(42)); - } - mask &= ~escFirst; - mask |= nextMask; - - if (mask != 0) { - // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant - // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that - - // firstly, resolve invalid sequences of = to deal with cases like '====' - unsigned int maskEq = _mm_movemask_epi8(cmpEq); - unsigned int tmp = eqFixLUT[(maskEq&0xff) & ~escFirst]; - maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp; - - escFirst = (maskEq >> (sizeof(__m128i)-1)); - // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed - maskEq <<= 1; - mask &= ~maskEq; - - // unescape chars following `=` - oData = _mm_add_epi8( - oData, - LOAD_HALVES( - eqAddLUT + (maskEq&0xff), - eqAddLUT + ((maskEq>>8)&0xff) - ) - ); - - // handle \r\n. sequences - // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing - // find instances of \r\n - __m128i tmpData1, tmpData2; - tmpData1 = _mm_insert_epi16(_mm_srli_si128(data, 1), *(uint16_t*)(dSrc + dI + sizeof(__m128i)-1), 7); - tmpData2 = _mm_insert_epi16(_mm_srli_si128(data, 2), *(uint16_t*)(dSrc + dI + sizeof(__m128i)), 7); - __m128i cmp1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d)); - __m128i cmp2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d)); - // prepare to merge the two comparisons - cmp1 = _mm_srli_si128(cmp1, 1); - // find all instances of . - tmpData2 = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.')); - // merge matches of \r\n with those for . - unsigned int killDots = _mm_movemask_epi8( - _mm_and_si128(tmpData2, _mm_or_si128(cmp1, cmp2)) - ); - mask |= (killDots << 2) & 0xffff; - nextMask = killDots >> (sizeof(__m128i)-2); - - // all that's left is to 'compress' the data (skip over masked chars) - alignas(32) uint32_t mmTmp[4]; - _mm_store_si128((__m128i*)mmTmp, oData); - - for(int j=0; j<4; j++) { - if(mask & 0xf) { - unsigned char* pMmTmp = (unsigned char*)(mmTmp + j); - unsigned int maskn = ~mask; - *p = pMmTmp[0]; - p += (maskn & 1); - *p = pMmTmp[1]; - p += (maskn & 2) >> 1; - *p = pMmTmp[2]; - p += (maskn & 4) >> 2; - *p = pMmTmp[3]; - p += (maskn & 8) >> 3; - } else { - *(uint32_t*)p = mmTmp[j]; - p += 4; - } - mask >>= 4; - } - } else { - STOREU_XMM(p, oData); - p += XMM_SIZE; - escFirst = 0; - nextMask = 0; - } - } - - if(escFirst) *pState = 1; // escape next character - else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n - else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r - else *pState = 3; - } - - // end alignment - if(i < len) { - p += decode_scalar(src + i, p, len - i, pState); - } - - return p - dest; -} - -extern size_t (*decode_sse2)(const unsigned char* src, unsigned char* dest, size_t len, char* state); +#define SIMD_DECODER +#include "SimdDecoder.cpp" #endif +} void init_decode_sse2() { #ifdef __SSE2__ - decode_sse2 = &do_decode_sse2; - - // generate unshuf LUT - for(int i=0; i<256; i++) { - int k = i; - uint8_t res[8]; - int p = 0; - - // fix LUT - k = i; - p = 0; - for(int j=0; j<8; j++) { - k = i >> j; - if(k & 1) { - p |= 1 << j; - j++; - } - } - eqFixLUT[i] = p; - - // sub LUT - k = i; - for(int j=0; j<8; j++) { - res[j] = (k & 1) ? 192 /* == -64 */ : 0; - k >>= 1; - } - _mm_storel_epi64((__m128i*)(eqAddLUT + i), _mm_loadl_epi64((__m128i*)res)); - } + decode = &YEncode::Sse2::do_decode_simd>; + YEncode::Sse2::decoder_init(); + decode_simd = true; #endif } diff --git a/lib/yencode/Ssse3Decoder.cpp b/lib/yencode/Ssse3Decoder.cpp index 15a971ee..aeb7b82b 100644 --- a/lib/yencode/Ssse3Decoder.cpp +++ b/lib/yencode/Ssse3Decoder.cpp @@ -30,213 +30,20 @@ namespace YEncode { -#ifdef __SSSE3__ -// combine two 8-bit ints into a 16-bit one -#define UINT16_PACK(a, b) ((a) | ((b) << 8)) - -#define XMM_SIZE 16 /*== (signed int)sizeof(__m128i)*/ - -#define STOREU_XMM(dest, xmm) \ - _mm_storeu_si128((__m128i*)(dest), xmm) - -#define LOAD_HALVES(a, b) _mm_castps_si128(_mm_loadh_pi( \ - _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)(a))), \ - (b) \ -)) - -// table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable -static const unsigned char BitsSetTable256[256] = +namespace Ssse3 { -# define B2(n) n, n+1, n+1, n+2 -# define B4(n) B2(n), B2(n+1), B2(n+1), B2(n+2) -# define B6(n) B4(n), B4(n+1), B4(n+1), B4(n+2) - B6(0), B6(1), B6(1), B6(2) -#undef B2 -#undef B4 -#undef B6 -}; - -extern uint8_t eqFixLUT[256]; -extern __m64 eqAddLUT[256]; - -alignas(32)__m64 unshufLUT[256]; -alignas(32) static const uint8_t _pshufb_combine_table[272] = { - 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, - 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80, - 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80, - 0x00,0x01,0x02,0x03,0x04,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80, - 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80, - 0x00,0x01,0x02,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80, - 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80, - 0x00,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80, - 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, -}; -static const __m128i* pshufb_combine_table = (const __m128i*)_pshufb_combine_table; - -size_t do_decode_ssse3(const unsigned char* src, unsigned char* dest, size_t len, char* state) { - if(len <= sizeof(__m128i)*2) return decode_scalar(src, dest, len, state); - - unsigned char *p = dest; // destination pointer - unsigned long i = 0; // input position - unsigned char escFirst = 0; // input character; first char needs escaping - unsigned int nextMask = 0; - char tState = 0; - char* pState = state ? state : &tState; - if((uintptr_t)src & ((sizeof(__m128i)-1))) { - // find source memory alignment - unsigned char* aSrc = (unsigned char*)(((uintptr_t)src + (sizeof(__m128i)-1)) & ~(sizeof(__m128i)-1)); - - i = (unsigned long)(aSrc - src); - p += decode_scalar(src, dest, i, pState); - } - - // handle finicky case of \r\n. straddled across initial boundary - if(*pState == 0 && i+1 < len && src[i] == '.') - nextMask = 1; - else if(*pState == 2 && i+2 < len && *(uint16_t*)(src + i) == UINT16_PACK('\n','.')) - nextMask = 2; - - escFirst = *pState == 1; - - if(i + (sizeof(__m128i)+1) < len) { - // our algorithm may perform an aligned load on the next part, of which we consider 2 bytes (for \r\n. sequence checking) - size_t dLen = len - (sizeof(__m128i)+1); - dLen = ((dLen-i) + 0xf) & ~0xf; - unsigned char* dSrc = (unsigned char*)src + dLen + i; - long dI = -(long)dLen; - i += dLen; - - for(; dI; dI += sizeof(__m128i)) { - __m128i data = _mm_load_si128((__m128i *)(dSrc + dI)); - - // search for special chars - __m128i cmpEq = _mm_cmpeq_epi8(data, _mm_set1_epi8('=')), - cmp = _mm_or_si128( - _mm_or_si128( - _mm_cmpeq_epi8(data, _mm_set1_epi16(0x0a0d)), // \r\n - _mm_cmpeq_epi8(data, _mm_set1_epi16(0x0d0a)) // \n\r - ), - cmpEq - ); - - unsigned int mask = _mm_movemask_epi8(cmp); // not the most accurate mask if we have invalid sequences; we fix this up later - - __m128i oData; - if(escFirst) { // rarely hit branch: seems to be faster to use 'if' than a lookup table, possibly due to values being able to be held in registers? - // first byte needs escaping due to preceeding = in last loop iteration - oData = _mm_sub_epi8(data, _mm_set_epi8(42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42+64)); - } else { - oData = _mm_sub_epi8(data, _mm_set1_epi8(42)); - } - mask &= ~escFirst; - mask |= nextMask; - - if (mask != 0) { - // a spec compliant encoder should never generate sequences: ==, =\n and =\r, but we'll handle them to be spec compliant - // the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that - - // firstly, resolve invalid sequences of = to deal with cases like '====' - unsigned int maskEq = _mm_movemask_epi8(cmpEq); - unsigned int tmp = eqFixLUT[(maskEq&0xff) & ~escFirst]; - maskEq = (eqFixLUT[(maskEq>>8) & ~(tmp>>7)] << 8) | tmp; - - escFirst = (maskEq >> (sizeof(__m128i)-1)); - // next, eliminate anything following a `=` from the special char mask; this eliminates cases of `=\r` so that they aren't removed - maskEq <<= 1; - mask &= ~maskEq; - - // unescape chars following `=` - oData = _mm_add_epi8( - oData, - LOAD_HALVES( - eqAddLUT + (maskEq&0xff), - eqAddLUT + ((maskEq>>8)&0xff) - ) - ); - - // handle \r\n. sequences - // RFC3977 requires the first dot on a line to be stripped, due to dot-stuffing - // find instances of \r\n - __m128i tmpData1, tmpData2; - __m128i nextData = _mm_load_si128((__m128i *)(dSrc + dI) + 1); - tmpData1 = _mm_alignr_epi8(nextData, data, 1); - tmpData2 = _mm_alignr_epi8(nextData, data, 2); - __m128i cmp1 = _mm_cmpeq_epi16(data, _mm_set1_epi16(0x0a0d)); - __m128i cmp2 = _mm_cmpeq_epi16(tmpData1, _mm_set1_epi16(0x0a0d)); - // prepare to merge the two comparisons - cmp1 = _mm_srli_si128(cmp1, 1); - // find all instances of . - tmpData2 = _mm_cmpeq_epi8(tmpData2, _mm_set1_epi8('.')); - // merge matches of \r\n with those for . - unsigned int killDots = _mm_movemask_epi8( - _mm_and_si128(tmpData2, _mm_or_si128(cmp1, cmp2)) - ); - mask |= (killDots << 2) & 0xffff; - nextMask = killDots >> (sizeof(__m128i)-2); - - // all that's left is to 'compress' the data (skip over masked chars) - unsigned char skipped = BitsSetTable256[mask & 0xff]; - // lookup compress masks and shuffle - // load up two halves - __m128i shuf = LOAD_HALVES(unshufLUT + (mask&0xff), unshufLUT + (mask>>8)); - - // offset upper half by 8 - shuf = _mm_add_epi8(shuf, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); - // shift down upper half into lower - // TODO: consider using `mask & 0xff` in table instead of counting bits - shuf = _mm_shuffle_epi8(shuf, _mm_load_si128(pshufb_combine_table + skipped)); - - // shuffle data - oData = _mm_shuffle_epi8(oData, shuf); - STOREU_XMM(p, oData); - - // increment output position - p += XMM_SIZE - skipped - BitsSetTable256[mask >> 8]; - } else { - STOREU_XMM(p, oData); - p += XMM_SIZE; - escFirst = 0; - nextMask = 0; - } - } - - if(escFirst) *pState = 1; // escape next character - else if(nextMask == 1) *pState = 0; // next character is '.', where previous two were \r\n - else if(nextMask == 2) *pState = 2; // next characters are '\n.', previous is \r - else *pState = 3; - } - - // end alignment - if(i < len) { - p += decode_scalar(src + i, p, len - i, pState); - } - - return p - dest; -} - -extern size_t (*decode_ssse3)(const unsigned char* src, unsigned char* dest, size_t len, char* state); +#ifdef __SSSE3__ +#define SIMD_DECODER +#include "SimdDecoder.cpp" #endif +} void init_decode_ssse3() { #ifdef __SSSE3__ - decode_ssse3 = do_decode_ssse3; - - // generate unshuf LUT - for(int i=0; i<256; i++) { - int k = i; - uint8_t res[8]; - int p = 0; - for(int j=0; j<8; j++) { - if(!(k & 1)) { - res[p++] = j; - } - k >>= 1; - } - for(; p<8; p++) - res[p] = 0; - _mm_storel_epi64((__m128i*)(unshufLUT + i), _mm_loadl_epi64((__m128i*)res)); - } + decode = &YEncode::Ssse3::do_decode_simd>; + YEncode::Ssse3::decoder_init(); + decode_simd = true; #endif } diff --git a/lib/yencode/YEncode.h b/lib/yencode/YEncode.h index 7b4c9538..757c953d 100644 --- a/lib/yencode/YEncode.h +++ b/lib/yencode/YEncode.h @@ -27,8 +27,19 @@ namespace YEncode { void init(); -extern size_t (*decode)(const unsigned char* inbuf, unsigned char* outbuf, size_t, char* state); -size_t decode_scalar(const unsigned char* src, unsigned char* dest, size_t len, char* state); + +typedef enum : char { + YDEC_STATE_CRLF, // default + YDEC_STATE_EQ, + YDEC_STATE_CR, + YDEC_STATE_NONE, + YDEC_STATE_CRLFDT, + YDEC_STATE_CRLFDTCR, + YDEC_STATE_CRLFEQ // may actually be "\r\n.=" in raw decoder +} YencDecoderState; + +extern int (*decode)(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state); +extern int decode_scalar(const unsigned char** src, unsigned char** dest, size_t len, YencDecoderState* state); extern bool decode_simd; struct crc_state diff --git a/nzbget.vcxproj b/nzbget.vcxproj index 72d9a6b6..9f8517de 100755 --- a/nzbget.vcxproj +++ b/nzbget.vcxproj @@ -280,6 +280,7 @@ + @@ -382,6 +383,7 @@ +