diff options
Diffstat (limited to 'src/Common/lzma/LzFind.c')
-rw-r--r-- | src/Common/lzma/LzFind.c | 1717 |
1 files changed, 1717 insertions, 0 deletions
diff --git a/src/Common/lzma/LzFind.c b/src/Common/lzma/LzFind.c new file mode 100644 index 00000000..0fbd5aae --- /dev/null +++ b/src/Common/lzma/LzFind.c @@ -0,0 +1,1717 @@ +/* LzFind.c -- Match finder for LZ algorithms +2023-03-14 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#include <string.h> +// #include <stdio.h> + +#include "CpuArch.h" +#include "LzFind.h" +#include "LzHash.h" + +#define kBlockMoveAlign (1 << 7) // alignment for memmove() +#define kBlockSizeAlign (1 << 16) // alignment for block allocation +#define kBlockSizeReserveMin (1 << 24) // it's 1/256 from 4 GB dictinary + +#define kEmptyHashValue 0 + +#define kMaxValForNormalize ((UInt32)0) +// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug + +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses + +#define GET_AVAIL_BYTES(p) \ + Inline_MatchFinder_GetNumAvailableBytes(p) + + +// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) +#define kFix5HashSize kFix4HashSize + +/* + HASH2_CALC: + if (hv) match, then cur[0] and cur[1] also match +*/ +#define HASH2_CALC hv = GetUi16(cur); + +// (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255] + +/* + HASH3_CALC: + if (cur[0]) and (h2) match, then cur[1] also match + if (cur[0]) and (hv) match, then cur[1] and cur[2] also match +*/ +#define HASH3_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; } + +#define HASH4_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; } + +#define HASH5_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \ + /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \ + hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; } + +#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; + + +static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) +{ + // if (!p->directInput) + { + ISzAlloc_Free(alloc, p->bufBase); + p->bufBase = NULL; + } +} + + +static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc) +{ + if (blockSize == 0) + return 0; + if (!p->bufBase || p->blockSize != blockSize) + { + // size_t blockSizeT; + LzInWindow_Free(p, alloc); + p->blockSize = blockSize; + // blockSizeT = blockSize; + + // printf("\nblockSize = 0x%x\n", blockSize); + /* + #if defined _WIN64 + // we can allocate 4GiB, but still use UInt32 for (p->blockSize) + // we use UInt32 type for (p->blockSize), because + // we don't want to wrap over 4 GiB, + // when we use (p->streamPos - p->pos) that is UInt32. + if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign) + { + blockSizeT = ((size_t)1 << 32); + printf("\nchanged to blockSizeT = 4GiB\n"); + } + #endif + */ + + p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); + // printf("\nbufferBase = %p\n", p->bufBase); + // return 0; // for debug + } + return (p->bufBase != NULL); +} + +static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } + +static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } + + +Z7_NO_INLINE +static void MatchFinder_ReadBlock(CMatchFinder *p) +{ + if (p->streamEndWasReached || p->result != SZ_OK) + return; + + /* We use (p->streamPos - p->pos) value. + (p->streamPos < p->pos) is allowed. */ + + if (p->directInput) + { + UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p); + if (curSize > p->directInputRem) + curSize = (UInt32)p->directInputRem; + p->streamPos += curSize; + p->directInputRem -= curSize; + if (p->directInputRem == 0) + p->streamEndWasReached = 1; + return; + } + + for (;;) + { + const Byte *dest = p->buffer + GET_AVAIL_BYTES(p); + size_t size = (size_t)(p->bufBase + p->blockSize - dest); + if (size == 0) + { + /* we call ReadBlock() after NeedMove() and MoveBlock(). + NeedMove() and MoveBlock() povide more than (keepSizeAfter) + to the end of (blockSize). + So we don't execute this branch in normal code flow. + We can go here, if we will call ReadBlock() before NeedMove(), MoveBlock(). + */ + // p->result = SZ_ERROR_FAIL; // we can show error here + return; + } + + // #define kRead 3 + // if (size > kRead) size = kRead; // for debug + + /* + // we need cast (Byte *)dest. + #ifdef __clang__ + #pragma GCC diagnostic ignored "-Wcast-qual" + #endif + */ + p->result = ISeqInStream_Read(p->stream, + p->bufBase + (dest - p->bufBase), &size); + if (p->result != SZ_OK) + return; + if (size == 0) + { + p->streamEndWasReached = 1; + return; + } + p->streamPos += (UInt32)size; + if (GET_AVAIL_BYTES(p) > p->keepSizeAfter) + return; + /* here and in another (p->keepSizeAfter) checks we keep on 1 byte more than was requested by Create() function + (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */ + } + + // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter) +} + + + +Z7_NO_INLINE +void MatchFinder_MoveBlock(CMatchFinder *p) +{ + const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore; + const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore; + p->buffer = p->bufBase + keepBefore; + memmove(p->bufBase, + p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)), + keepBefore + (size_t)GET_AVAIL_BYTES(p)); +} + +/* We call MoveBlock() before ReadBlock(). + So MoveBlock() can be wasteful operation, if the whole input data + can fit in current block even without calling MoveBlock(). + in important case where (dataSize <= historySize) + condition (p->blockSize > dataSize + p->keepSizeAfter) is met + So there is no MoveBlock() in that case case. +*/ + +int MatchFinder_NeedMove(CMatchFinder *p) +{ + if (p->directInput) + return 0; + if (p->streamEndWasReached || p->result != SZ_OK) + return 0; + return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter); +} + +void MatchFinder_ReadIfRequired(CMatchFinder *p) +{ + if (p->keepSizeAfter >= GET_AVAIL_BYTES(p)) + MatchFinder_ReadBlock(p); +} + + + +static void MatchFinder_SetDefaultSettings(CMatchFinder *p) +{ + p->cutValue = 32; + p->btMode = 1; + p->numHashBytes = 4; + p->numHashBytes_Min = 2; + p->numHashOutBits = 0; + p->bigHash = 0; +} + +#define kCrcPoly 0xEDB88320 + +void MatchFinder_Construct(CMatchFinder *p) +{ + unsigned i; + p->buffer = NULL; + p->bufBase = NULL; + p->directInput = 0; + p->stream = NULL; + p->hash = NULL; + p->expectedDataSize = (UInt64)(Int64)-1; + MatchFinder_SetDefaultSettings(p); + + for (i = 0; i < 256; i++) + { + UInt32 r = (UInt32)i; + unsigned j; + for (j = 0; j < 8; j++) + r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); + p->crc[i] = r; + } +} + +#undef kCrcPoly + +static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc) +{ + ISzAlloc_Free(alloc, p->hash); + p->hash = NULL; +} + +void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc) +{ + MatchFinder_FreeThisClassMemory(p, alloc); + LzInWindow_Free(p, alloc); +} + +static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc) +{ + const size_t sizeInBytes = (size_t)num * sizeof(CLzRef); + if (sizeInBytes / sizeof(CLzRef) != num) + return NULL; + return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes); +} + +#if (kBlockSizeReserveMin < kBlockSizeAlign * 2) + #error Stop_Compiling_Bad_Reserve +#endif + + + +static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize) +{ + UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter); + /* + if (historySize > kMaxHistorySize) + return 0; + */ + // printf("\nhistorySize == 0x%x\n", historySize); + + if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow + return 0; + + { + const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign; + const UInt32 rem = kBlockSizeMax - blockSize; + const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2)) + + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here + if (blockSize >= kBlockSizeMax + || rem < kBlockSizeReserveMin) // we reject settings that will be slow + return 0; + if (reserve >= rem) + blockSize = kBlockSizeMax; + else + { + blockSize += reserve; + blockSize &= ~(UInt32)(kBlockSizeAlign - 1); + } + } + // printf("\n LzFind_blockSize = %x\n", blockSize); + // printf("\n LzFind_blockSize = %d\n", blockSize >> 20); + return blockSize; +} + + +// input is historySize +static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs) +{ + if (p->numHashBytes == 2) + return (1 << 16) - 1; + if (hs != 0) + hs--; + hs |= (hs >> 1); + hs |= (hs >> 2); + hs |= (hs >> 4); + hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later + if (hs >= (1 << 24)) + { + if (p->numHashBytes == 3) + hs = (1 << 24) - 1; + /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + } + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + return hs; +} + +// input is historySize +static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs) +{ + if (p->numHashBytes == 2) + return (1 << 16) - 1; + if (hs != 0) + hs--; + hs |= (hs >> 1); + hs |= (hs >> 2); + hs |= (hs >> 4); + hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later + hs >>= 1; + if (hs >= (1 << 24)) + { + if (p->numHashBytes == 3) + hs = (1 << 24) - 1; + else + hs >>= 1; + /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + } + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + return hs; +} + + +int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, + UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, + ISzAllocPtr alloc) +{ + /* we need one additional byte in (p->keepSizeBefore), + since we use MoveBlock() after (p->pos++) and before dictionary using */ + // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug + p->keepSizeBefore = historySize + keepAddBufferBefore + 1; + + keepAddBufferAfter += matchMaxLen; + /* we need (p->keepSizeAfter >= p->numHashBytes) */ + if (keepAddBufferAfter < p->numHashBytes) + keepAddBufferAfter = p->numHashBytes; + // keepAddBufferAfter -= 2; // for debug + p->keepSizeAfter = keepAddBufferAfter; + + if (p->directInput) + p->blockSize = 0; + if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc)) + { + size_t hashSizeSum; + { + UInt32 hs; + UInt32 hsCur; + + if (p->numHashOutBits != 0) + { + unsigned numBits = p->numHashOutBits; + const unsigned nbMax = + (p->numHashBytes == 2 ? 16 : + (p->numHashBytes == 3 ? 24 : 32)); + if (numBits > nbMax) + numBits = nbMax; + if (numBits >= 32) + hs = (UInt32)0 - 1; + else + hs = ((UInt32)1 << numBits) - 1; + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + { + const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); + if (hs > hs2) + hs = hs2; + } + hsCur = hs; + if (p->expectedDataSize < historySize) + { + const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); + if (hsCur > hs2) + hsCur = hs2; + } + } + else + { + hs = MatchFinder_GetHashMask(p, historySize); + hsCur = hs; + if (p->expectedDataSize < historySize) + { + hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); + if (hsCur > hs) // is it possible? + hsCur = hs; + } + } + + p->hashMask = hsCur; + + hashSizeSum = hs; + hashSizeSum++; + if (hashSizeSum < hs) + return 0; + { + UInt32 fixedHashSize = 0; + if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size; + if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size; + // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size; + hashSizeSum += fixedHashSize; + p->fixedHashSize = fixedHashSize; + } + } + + p->matchMaxLen = matchMaxLen; + + { + size_t newSize; + size_t numSons; + const UInt32 newCyclicBufferSize = historySize + 1; // do not change it + p->historySize = historySize; + p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1) + + numSons = newCyclicBufferSize; + if (p->btMode) + numSons <<= 1; + newSize = hashSizeSum + numSons; + + if (numSons < newCyclicBufferSize || newSize < numSons) + return 0; + + // aligned size is not required here, but it can be better for some loops + #define NUM_REFS_ALIGN_MASK 0xF + newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK; + + // 22.02: we don't reallocate buffer, if old size is enough + if (p->hash && p->numRefs >= newSize) + return 1; + + MatchFinder_FreeThisClassMemory(p, alloc); + p->numRefs = newSize; + p->hash = AllocRefs(newSize, alloc); + + if (p->hash) + { + p->son = p->hash + hashSizeSum; + return 1; + } + } + } + + MatchFinder_Free(p, alloc); + return 0; +} + + +static void MatchFinder_SetLimits(CMatchFinder *p) +{ + UInt32 k; + UInt32 n = kMaxValForNormalize - p->pos; + if (n == 0) + n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0) + + k = p->cyclicBufferSize - p->cyclicBufferPos; + if (k < n) + n = k; + + k = GET_AVAIL_BYTES(p); + { + const UInt32 ksa = p->keepSizeAfter; + UInt32 mm = p->matchMaxLen; + if (k > ksa) + k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock + else if (k >= mm) + { + // the limitation for (p->lenLimit) update + k -= mm; // optimization : to reduce the number of checks + k++; + // k = 1; // non-optimized version : for debug + } + else + { + mm = k; + if (k != 0) + k = 1; + } + p->lenLimit = mm; + } + if (k < n) + n = k; + + p->posLimit = p->pos + n; +} + + +void MatchFinder_Init_LowHash(CMatchFinder *p) +{ + size_t i; + CLzRef *items = p->hash; + const size_t numItems = p->fixedHashSize; + for (i = 0; i < numItems; i++) + items[i] = kEmptyHashValue; +} + + +void MatchFinder_Init_HighHash(CMatchFinder *p) +{ + size_t i; + CLzRef *items = p->hash + p->fixedHashSize; + const size_t numItems = (size_t)p->hashMask + 1; + for (i = 0; i < numItems; i++) + items[i] = kEmptyHashValue; +} + + +void MatchFinder_Init_4(CMatchFinder *p) +{ + if (!p->directInput) + p->buffer = p->bufBase; + { + /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker. + the code in CMatchFinderMt expects (pos = 1) */ + p->pos = + p->streamPos = + 1; // it's smallest optimal value. do not change it + // 0; // for debug + } + p->result = SZ_OK; + p->streamEndWasReached = 0; +} + + +// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug + +void MatchFinder_Init(CMatchFinder *p) +{ + MatchFinder_Init_HighHash(p); + MatchFinder_Init_LowHash(p); + MatchFinder_Init_4(p); + // if (readData) + MatchFinder_ReadBlock(p); + + /* if we init (cyclicBufferPos = pos), then we can use one variable + instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */ + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos) + // p->cyclicBufferPos = 0; // smallest value + // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses. + MatchFinder_SetLimits(p); +} + + + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__clang__) && (__clang_major__ >= 4) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701) + // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) + + #define USE_LZFIND_SATUR_SUB_128 + #define USE_LZFIND_SATUR_SUB_256 + #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) + #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2"))) + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1600) + #define USE_LZFIND_SATUR_SUB_128 + #endif + #if (_MSC_VER >= 1900) + #define USE_LZFIND_SATUR_SUB_256 + #endif + #endif + +// #elif defined(MY_CPU_ARM_OR_ARM64) +#elif defined(MY_CPU_ARM64) + + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #define USE_LZFIND_SATUR_SUB_128 + #ifdef MY_CPU_ARM64 + // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) + #else + // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) + #endif + + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1910) + #define USE_LZFIND_SATUR_SUB_128 + #endif + #endif + + #if defined(_MSC_VER) && defined(MY_CPU_ARM64) + #include <arm64_neon.h> + #else + #include <arm_neon.h> + #endif + +#endif + + +#ifdef USE_LZFIND_SATUR_SUB_128 + +// #define Z7_SHOW_HW_STATUS + +#ifdef Z7_SHOW_HW_STATUS +#include <stdio.h> +#define PRF(x) x +PRF(;) +#else +#define PRF(x) +#endif + + +#ifdef MY_CPU_ARM_OR_ARM64 + +#ifdef MY_CPU_ARM64 +// #define FORCE_LZFIND_SATUR_SUB_128 +#endif +typedef uint32x4_t LzFind_v128; +#define SASUB_128_V(v, s) \ + vsubq_u32(vmaxq_u32(v, s), s) + +#else // MY_CPU_ARM_OR_ARM64 + +#include <smmintrin.h> // sse4.1 + +typedef __m128i LzFind_v128; +// SSE 4.1 +#define SASUB_128_V(v, s) \ + _mm_sub_epi32(_mm_max_epu32(v, s), s) + +#endif // MY_CPU_ARM_OR_ARM64 + + +#define SASUB_128(i) \ + *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \ + *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2); + + +Z7_NO_INLINE +static +#ifdef LZFIND_ATTRIB_SSE41 +LZFIND_ATTRIB_SSE41 +#endif +void +Z7_FASTCALL +LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim) +{ + const LzFind_v128 sub2 = + #ifdef MY_CPU_ARM_OR_ARM64 + vdupq_n_u32(subValue); + #else + _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + #endif + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SASUB_128(0) SASUB_128(1) items += 2 * 4; + SASUB_128(0) SASUB_128(1) items += 2 * 4; + } + while (items != lim); +} + + + +#ifdef USE_LZFIND_SATUR_SUB_256 + +#include <immintrin.h> // avx +/* +clang :immintrin.h uses +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX2__) +#include <avx2intrin.h> +#endif +so we need <avxintrin.h> for clang-cl */ + +#if defined(__clang__) +#include <avxintrin.h> +#include <avx2intrin.h> +#endif + +// AVX2: +#define SASUB_256(i) \ + *( __m256i *)( void *)(items + (i) * 8) = \ + _mm256_sub_epi32(_mm256_max_epu32( \ + *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); + +Z7_NO_INLINE +static +#ifdef LZFIND_ATTRIB_AVX2 +LZFIND_ATTRIB_AVX2 +#endif +void +Z7_FASTCALL +LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim) +{ + const __m256i sub2 = _mm256_set_epi32( + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SASUB_256(0) SASUB_256(1) items += 2 * 8; + SASUB_256(0) SASUB_256(1) items += 2 * 8; + } + while (items != lim); +} +#endif // USE_LZFIND_SATUR_SUB_256 + +#ifndef FORCE_LZFIND_SATUR_SUB_128 +typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)( + UInt32 subValue, CLzRef *items, const CLzRef *lim); +static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; +#endif // FORCE_LZFIND_SATUR_SUB_128 + +#endif // USE_LZFIND_SATUR_SUB_128 + + +// kEmptyHashValue must be zero +// #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; } +#define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; } + +#ifdef FORCE_LZFIND_SATUR_SUB_128 + +#define DEFAULT_SaturSub LzFind_SaturSub_128 + +#else + +#define DEFAULT_SaturSub LzFind_SaturSub_32 + +Z7_NO_INLINE +static +void +Z7_FASTCALL +LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + } + while (items != lim); +} + +#endif + + +Z7_NO_INLINE +void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) +{ + #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7) + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) + { + SASUB_32(0) + items++; + } + { + const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1); + CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask); + numItems &= k_Align_Mask; + if (items != lim) + { + #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128) + if (g_LzFind_SaturSub) + g_LzFind_SaturSub(subValue, items, lim); + else + #endif + DEFAULT_SaturSub(subValue, items, lim); + } + items = lim; + } + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0; numItems--) + { + SASUB_32(0) + items++; + } +} + + + +// call MatchFinder_CheckLimits() only after (p->pos++) update + +Z7_NO_INLINE +static void MatchFinder_CheckLimits(CMatchFinder *p) +{ + if (// !p->streamEndWasReached && p->result == SZ_OK && + p->keepSizeAfter == GET_AVAIL_BYTES(p)) + { + // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p)) + if (MatchFinder_NeedMove(p)) + MatchFinder_MoveBlock(p); + MatchFinder_ReadBlock(p); + } + + if (p->pos == kMaxValForNormalize) + if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data. + /* + if we disable normalization for last bytes of data, and + if (data_size == 4 GiB), we don't call wastfull normalization, + but (pos) will be wrapped over Zero (0) in that case. + And we cannot resume later to normal operation + */ + { + // MatchFinder_Normalize(p); + /* after normalization we need (p->pos >= p->historySize + 1); */ + /* we can reduce subValue to aligned value, if want to keep alignment + of (p->pos) and (p->buffer) for speculated accesses. */ + const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */; + // const UInt32 subValue = (1 << 15); // for debug + // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue); + MatchFinder_REDUCE_OFFSETS(p, subValue) + MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize); + { + size_t numSonRefs = p->cyclicBufferSize; + if (p->btMode) + numSonRefs <<= 1; + MatchFinder_Normalize3(subValue, p->son, numSonRefs); + } + } + + if (p->cyclicBufferPos == p->cyclicBufferSize) + p->cyclicBufferPos = 0; + + MatchFinder_SetLimits(p); +} + + +/* + (lenLimit > maxLen) +*/ +Z7_FORCE_INLINE +static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, unsigned maxLen) +{ + /* + son[_cyclicBufferPos] = curMatch; + for (;;) + { + UInt32 delta = pos - curMatch; + if (cutValue-- == 0 || delta >= _cyclicBufferSize) + return d; + { + const Byte *pb = cur - delta; + curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; + if (pb[maxLen] == cur[maxLen] && *pb == *cur) + { + UInt32 len = 0; + while (++len != lenLimit) + if (pb[len] != cur[len]) + break; + if (maxLen < len) + { + maxLen = len; + *d++ = len; + *d++ = delta - 1; + if (len == lenLimit) + return d; + } + } + } + } + */ + + const Byte *lim = cur + lenLimit; + son[_cyclicBufferPos] = curMatch; + + do + { + UInt32 delta; + + if (curMatch == 0) + break; + // if (curMatch2 >= curMatch) return NULL; + delta = pos - curMatch; + if (delta >= _cyclicBufferSize) + break; + { + ptrdiff_t diff; + curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; + diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) + { + const Byte *c = cur; + while (*c == c[diff]) + { + if (++c == lim) + { + d[0] = (UInt32)(lim - cur); + d[1] = delta - 1; + return d + 2; + } + } + { + const unsigned len = (unsigned)(c - cur); + if (maxLen < len) + { + maxLen = len; + d[0] = (UInt32)len; + d[1] = delta - 1; + d += 2; + } + } + } + } + } + while (--cutValue); + + return d; +} + + +Z7_FORCE_INLINE +UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, UInt32 maxLen) +{ + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + unsigned len0 = 0, len1 = 0; + + UInt32 cmCheck; + + // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos <= _cyclicBufferSize) + cmCheck = 0; + + if (cmCheck < curMatch) + do + { + const UInt32 delta = pos - curMatch; + { + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + const Byte *pb = cur - delta; + unsigned len = (len0 < len1 ? len0 : len1); + const UInt32 pair0 = pair[0]; + if (pb[len] == cur[len]) + { + if (++len != lenLimit && pb[len] == cur[len]) + while (++len != lenLimit) + if (pb[len] != cur[len]) + break; + if (maxLen < len) + { + maxLen = (UInt32)len; + *d++ = (UInt32)len; + *d++ = delta - 1; + if (len == lenLimit) + { + *ptr1 = pair0; + *ptr0 = pair[1]; + return d; + } + } + } + if (pb[len] < cur[len]) + { + *ptr1 = curMatch; + // const UInt32 curMatch2 = pair[1]; + // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + // curMatch = curMatch2; + curMatch = pair[1]; + ptr1 = pair + 1; + len1 = len; + } + else + { + *ptr0 = curMatch; + curMatch = pair[0]; + ptr0 = pair; + len0 = len; + } + } + } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return d; +} + + +static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) +{ + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + unsigned len0 = 0, len1 = 0; + + UInt32 cmCheck; + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos <= _cyclicBufferSize) + cmCheck = 0; + + if (// curMatch >= pos || // failure + cmCheck < curMatch) + do + { + const UInt32 delta = pos - curMatch; + { + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + const Byte *pb = cur - delta; + unsigned len = (len0 < len1 ? len0 : len1); + if (pb[len] == cur[len]) + { + while (++len != lenLimit) + if (pb[len] != cur[len]) + break; + { + if (len == lenLimit) + { + *ptr1 = pair[0]; + *ptr0 = pair[1]; + return; + } + } + } + if (pb[len] < cur[len]) + { + *ptr1 = curMatch; + curMatch = pair[1]; + ptr1 = pair + 1; + len1 = len; + } + else + { + *ptr0 = curMatch; + curMatch = pair[0]; + ptr0 = pair; + len0 = len; + } + } + } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return; +} + + +#define MOVE_POS \ + ++p->cyclicBufferPos; \ + p->buffer++; \ + { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } + +#define MOVE_POS_RET MOVE_POS return distances; + +Z7_NO_INLINE +static void MatchFinder_MovePos(CMatchFinder *p) +{ + /* we go here at the end of stream data, when (avail < num_hash_bytes) + We don't update sons[cyclicBufferPos << btMode]. + So (sons) record will contain junk. And we cannot resume match searching + to normal operation, even if we will provide more input data in buffer. + p->sons[p->cyclicBufferPos << p->btMode] = 0; // kEmptyHashValue + if (p->btMode) + p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue + */ + MOVE_POS +} + +#define GET_MATCHES_HEADER2(minLen, ret_op) \ + unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ + lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ + cur = p->buffer; + +#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) +#define SKIP_HEADER(minLen) do { GET_MATCHES_HEADER2(minLen, continue) + +#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue + +#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num); + +#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ + distances = func(MF_PARAMS(p), \ + distances, (UInt32)_maxLen_); MOVE_POS_RET + +#define GET_MATCHES_FOOTER_BT(_maxLen_) \ + GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) + +#define GET_MATCHES_FOOTER_HC(_maxLen_) \ + GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec) + + + +#define UPDATE_maxLen { \ + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \ + const Byte *c = cur + maxLen; \ + const Byte *lim = cur + lenLimit; \ + for (; c != lim; c++) if (*(c + diff) != *c) break; \ + maxLen = (unsigned)(c - cur); } + +static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +{ + GET_MATCHES_HEADER(2) + HASH2_CALC + curMatch = p->hash[hv]; + p->hash[hv] = p->pos; + GET_MATCHES_FOOTER_BT(1) +} + +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +{ + GET_MATCHES_HEADER(3) + HASH_ZIP_CALC + curMatch = p->hash[hv]; + p->hash[hv] = p->pos; + GET_MATCHES_FOOTER_BT(2) +} + + +#define SET_mmm \ + mmm = p->cyclicBufferSize; \ + if (pos < mmm) \ + mmm = pos; + + +static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +{ + UInt32 mmm; + UInt32 h2, d2, pos; + unsigned maxLen; + UInt32 *hash; + GET_MATCHES_HEADER(3) + + HASH3_CALC + + hash = p->hash; + pos = p->pos; + + d2 = pos - hash[h2]; + + curMatch = (hash + kFix3HashSize)[hv]; + + hash[h2] = pos; + (hash + kFix3HashSize)[hv] = pos; + + SET_mmm + + maxLen = 2; + + if (d2 < mmm && *(cur - d2) == *cur) + { + UPDATE_maxLen + distances[0] = (UInt32)maxLen; + distances[1] = d2 - 1; + distances += 2; + if (maxLen == lenLimit) + { + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET + } + } + + GET_MATCHES_FOOTER_BT(maxLen) +} + + +static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +{ + UInt32 mmm; + UInt32 h2, h3, d2, d3, pos; + unsigned maxLen; + UInt32 *hash; + GET_MATCHES_HEADER(4) + + HASH4_CALC + + hash = p->hash; + pos = p->pos; + + d2 = pos - hash [h2]; + d3 = pos - (hash + kFix3HashSize)[h3]; + curMatch = (hash + kFix4HashSize)[hv]; + + hash [h2] = pos; + (hash + kFix3HashSize)[h3] = pos; + (hash + kFix4HashSize)[hv] = pos; + + SET_mmm + + maxLen = 3; + + for (;;) + { + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + // distances[-2] = 3; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + + UPDATE_maxLen + distances[-2] = (UInt32)maxLen; + if (maxLen == lenLimit) + { + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET + } + break; + } + + GET_MATCHES_FOOTER_BT(maxLen) +} + + +static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +{ + UInt32 mmm; + UInt32 h2, h3, d2, d3, maxLen, pos; + UInt32 *hash; + GET_MATCHES_HEADER(5) + + HASH5_CALC + + hash = p->hash; + pos = p->pos; + + d2 = pos - hash [h2]; + d3 = pos - (hash + kFix3HashSize)[h3]; + // d4 = pos - (hash + kFix4HashSize)[h4]; + + curMatch = (hash + kFix5HashSize)[hv]; + + hash [h2] = pos; + (hash + kFix3HashSize)[h3] = pos; + // (hash + kFix4HashSize)[h4] = pos; + (hash + kFix5HashSize)[hv] = pos; + + SET_mmm + + maxLen = 4; + + for (;;) + { + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + + distances[-2] = 3; + if (*(cur - d2 + 3) != cur[3]) + break; + UPDATE_maxLen + distances[-2] = (UInt32)maxLen; + if (maxLen == lenLimit) + { + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET + } + break; + } + + GET_MATCHES_FOOTER_BT(maxLen) +} + + +static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +{ + UInt32 mmm; + UInt32 h2, h3, d2, d3, pos; + unsigned maxLen; + UInt32 *hash; + GET_MATCHES_HEADER(4) + + HASH4_CALC + + hash = p->hash; + pos = p->pos; + + d2 = pos - hash [h2]; + d3 = pos - (hash + kFix3HashSize)[h3]; + curMatch = (hash + kFix4HashSize)[hv]; + + hash [h2] = pos; + (hash + kFix3HashSize)[h3] = pos; + (hash + kFix4HashSize)[hv] = pos; + + SET_mmm + + maxLen = 3; + + for (;;) + { + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + // distances[-2] = 3; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + + UPDATE_maxLen + distances[-2] = (UInt32)maxLen; + if (maxLen == lenLimit) + { + p->son[p->cyclicBufferPos] = curMatch; + MOVE_POS_RET + } + break; + } + + GET_MATCHES_FOOTER_HC(maxLen) +} + + +static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +{ + UInt32 mmm; + UInt32 h2, h3, d2, d3, maxLen, pos; + UInt32 *hash; + GET_MATCHES_HEADER(5) + + HASH5_CALC + + hash = p->hash; + pos = p->pos; + + d2 = pos - hash [h2]; + d3 = pos - (hash + kFix3HashSize)[h3]; + // d4 = pos - (hash + kFix4HashSize)[h4]; + + curMatch = (hash + kFix5HashSize)[hv]; + + hash [h2] = pos; + (hash + kFix3HashSize)[h3] = pos; + // (hash + kFix4HashSize)[h4] = pos; + (hash + kFix5HashSize)[hv] = pos; + + SET_mmm + + maxLen = 4; + + for (;;) + { + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + + distances[-2] = 3; + if (*(cur - d2 + 3) != cur[3]) + break; + UPDATE_maxLen + distances[-2] = maxLen; + if (maxLen == lenLimit) + { + p->son[p->cyclicBufferPos] = curMatch; + MOVE_POS_RET + } + break; + } + + GET_MATCHES_FOOTER_HC(maxLen) +} + + +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +{ + GET_MATCHES_HEADER(3) + HASH_ZIP_CALC + curMatch = p->hash[hv]; + p->hash[hv] = p->pos; + GET_MATCHES_FOOTER_HC(2) +} + + +static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +{ + SKIP_HEADER(2) + { + HASH2_CALC + curMatch = p->hash[hv]; + p->hash[hv] = p->pos; + } + SKIP_FOOTER +} + +void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +{ + SKIP_HEADER(3) + { + HASH_ZIP_CALC + curMatch = p->hash[hv]; + p->hash[hv] = p->pos; + } + SKIP_FOOTER +} + +static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +{ + SKIP_HEADER(3) + { + UInt32 h2; + UInt32 *hash; + HASH3_CALC + hash = p->hash; + curMatch = (hash + kFix3HashSize)[hv]; + hash[h2] = + (hash + kFix3HashSize)[hv] = p->pos; + } + SKIP_FOOTER +} + +static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +{ + SKIP_HEADER(4) + { + UInt32 h2, h3; + UInt32 *hash; + HASH4_CALC + hash = p->hash; + curMatch = (hash + kFix4HashSize)[hv]; + hash [h2] = + (hash + kFix3HashSize)[h3] = + (hash + kFix4HashSize)[hv] = p->pos; + } + SKIP_FOOTER +} + +static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +{ + SKIP_HEADER(5) + { + UInt32 h2, h3; + UInt32 *hash; + HASH5_CALC + hash = p->hash; + curMatch = (hash + kFix5HashSize)[hv]; + hash [h2] = + (hash + kFix3HashSize)[h3] = + // (hash + kFix4HashSize)[h4] = + (hash + kFix5HashSize)[hv] = p->pos; + } + SKIP_FOOTER +} + + +#define HC_SKIP_HEADER(minLen) \ + do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \ + const Byte *cur; \ + UInt32 *hash; \ + UInt32 *son; \ + UInt32 pos = p->pos; \ + UInt32 num2 = num; \ + /* (p->pos == p->posLimit) is not allowed here !!! */ \ + { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \ + num -= num2; \ + { const UInt32 cycPos = p->cyclicBufferPos; \ + son = p->son + cycPos; \ + p->cyclicBufferPos = cycPos + num2; } \ + cur = p->buffer; \ + hash = p->hash; \ + do { \ + UInt32 curMatch; \ + UInt32 hv; + + +#define HC_SKIP_FOOTER \ + cur++; pos++; *son++ = curMatch; \ + } while (--num2); \ + p->buffer = cur; \ + p->pos = pos; \ + if (pos == p->posLimit) MatchFinder_CheckLimits(p); \ + }} while(num); \ + + +static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +{ + HC_SKIP_HEADER(4) + + UInt32 h2, h3; + HASH4_CALC + curMatch = (hash + kFix4HashSize)[hv]; + hash [h2] = + (hash + kFix3HashSize)[h3] = + (hash + kFix4HashSize)[hv] = pos; + + HC_SKIP_FOOTER +} + + +static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +{ + HC_SKIP_HEADER(5) + + UInt32 h2, h3; + HASH5_CALC + curMatch = (hash + kFix5HashSize)[hv]; + hash [h2] = + (hash + kFix3HashSize)[h3] = + // (hash + kFix4HashSize)[h4] = + (hash + kFix5HashSize)[hv] = pos; + + HC_SKIP_FOOTER +} + + +void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +{ + HC_SKIP_HEADER(3) + + HASH_ZIP_CALC + curMatch = hash[hv]; + hash[hv] = pos; + + HC_SKIP_FOOTER +} + + +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) +{ + vTable->Init = (Mf_Init_Func)MatchFinder_Init; + vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; + vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; + if (!p->btMode) + { + if (p->numHashBytes <= 4) + { + vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; + vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; + } + else + { + vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches; + vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip; + } + } + else if (p->numHashBytes == 2) + { + vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; + vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; + } + else if (p->numHashBytes == 3) + { + vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; + vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; + } + else if (p->numHashBytes == 4) + { + vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; + vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; + } + else + { + vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches; + vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; + } +} + + + +void LzFindPrepare(void) +{ + #ifndef FORCE_LZFIND_SATUR_SUB_128 + #ifdef USE_LZFIND_SATUR_SUB_128 + LZFIND_SATUR_SUB_CODE_FUNC f = NULL; + #ifdef MY_CPU_ARM_OR_ARM64 + { + if (CPU_IsSupported_NEON()) + { + // #pragma message ("=== LzFind NEON") + PRF(printf("\n=== LzFind NEON\n")); + f = LzFind_SaturSub_128; + } + // f = 0; // for debug + } + #else // MY_CPU_ARM_OR_ARM64 + if (CPU_IsSupported_SSE41()) + { + // #pragma message ("=== LzFind SSE41") + PRF(printf("\n=== LzFind SSE41\n")); + f = LzFind_SaturSub_128; + + #ifdef USE_LZFIND_SATUR_SUB_256 + if (CPU_IsSupported_AVX2()) + { + // #pragma message ("=== LzFind AVX2") + PRF(printf("\n=== LzFind AVX2\n")); + f = LzFind_SaturSub_256; + } + #endif + } + #endif // MY_CPU_ARM_OR_ARM64 + g_LzFind_SaturSub = f; + #endif // USE_LZFIND_SATUR_SUB_128 + #endif // FORCE_LZFIND_SATUR_SUB_128 +} + + +#undef MOVE_POS +#undef MOVE_POS_RET +#undef PRF |