diff options
Diffstat (limited to 'src/Common/lzma/LzFind.c')
-rw-r--r-- | src/Common/lzma/LzFind.c | 519 |
1 files changed, 304 insertions, 215 deletions
diff --git a/src/Common/lzma/LzFind.c b/src/Common/lzma/LzFind.c index 1b73c284..0fbd5aae 100644 --- a/src/Common/lzma/LzFind.c +++ b/src/Common/lzma/LzFind.c @@ -1,5 +1,5 @@ /* LzFind.c -- Match finder for LZ algorithms -2021-11-29 : Igor Pavlov : Public domain */ +2023-03-14 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -17,7 +17,7 @@ #define kEmptyHashValue 0 #define kMaxValForNormalize ((UInt32)0) -// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xFFF) // for debug +// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses @@ -67,10 +67,10 @@ static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) { - if (!p->directInput) + // if (!p->directInput) { - ISzAlloc_Free(alloc, p->bufferBase); - p->bufferBase = NULL; + ISzAlloc_Free(alloc, p->bufBase); + p->bufBase = NULL; } } @@ -79,7 +79,7 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all { if (blockSize == 0) return 0; - if (!p->bufferBase || p->blockSize != blockSize) + if (!p->bufBase || p->blockSize != blockSize) { // size_t blockSizeT; LzInWindow_Free(p, alloc); @@ -101,11 +101,11 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all #endif */ - p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); - // printf("\nbufferBase = %p\n", p->bufferBase); + p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); + // printf("\nbufferBase = %p\n", p->bufBase); // return 0; // for debug } - return (p->bufferBase != NULL); + return (p->bufBase != NULL); } static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } @@ -113,7 +113,7 @@ static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } -MY_NO_INLINE +Z7_NO_INLINE static void MatchFinder_ReadBlock(CMatchFinder *p) { if (p->streamEndWasReached || p->result != SZ_OK) @@ -127,8 +127,8 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p); if (curSize > p->directInputRem) curSize = (UInt32)p->directInputRem; - p->directInputRem -= curSize; p->streamPos += curSize; + p->directInputRem -= curSize; if (p->directInputRem == 0) p->streamEndWasReached = 1; return; @@ -136,8 +136,8 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) for (;;) { - Byte *dest = p->buffer + GET_AVAIL_BYTES(p); - size_t size = (size_t)(p->bufferBase + p->blockSize - dest); + const Byte *dest = p->buffer + GET_AVAIL_BYTES(p); + size_t size = (size_t)(p->bufBase + p->blockSize - dest); if (size == 0) { /* we call ReadBlock() after NeedMove() and MoveBlock(). @@ -153,7 +153,14 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) // #define kRead 3 // if (size > kRead) size = kRead; // for debug - p->result = ISeqInStream_Read(p->stream, dest, &size); + /* + // we need cast (Byte *)dest. + #ifdef __clang__ + #pragma GCC diagnostic ignored "-Wcast-qual" + #endif + */ + p->result = ISeqInStream_Read(p->stream, + p->bufBase + (dest - p->bufBase), &size); if (p->result != SZ_OK) return; if (size == 0) @@ -173,14 +180,14 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) -MY_NO_INLINE +Z7_NO_INLINE void MatchFinder_MoveBlock(CMatchFinder *p) { - const size_t offset = (size_t)(p->buffer - p->bufferBase) - p->keepSizeBefore; + const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore; const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore; - p->buffer = p->bufferBase + keepBefore; - memmove(p->bufferBase, - p->bufferBase + (offset & ~((size_t)kBlockMoveAlign - 1)), + p->buffer = p->bufBase + keepBefore; + memmove(p->bufBase, + p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)), keepBefore + (size_t)GET_AVAIL_BYTES(p)); } @@ -198,7 +205,7 @@ int MatchFinder_NeedMove(CMatchFinder *p) return 0; if (p->streamEndWasReached || p->result != SZ_OK) return 0; - return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter); + return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter); } void MatchFinder_ReadIfRequired(CMatchFinder *p) @@ -214,6 +221,8 @@ static void MatchFinder_SetDefaultSettings(CMatchFinder *p) p->cutValue = 32; p->btMode = 1; p->numHashBytes = 4; + p->numHashBytes_Min = 2; + p->numHashOutBits = 0; p->bigHash = 0; } @@ -222,8 +231,10 @@ static void MatchFinder_SetDefaultSettings(CMatchFinder *p) void MatchFinder_Construct(CMatchFinder *p) { unsigned i; - p->bufferBase = NULL; + p->buffer = NULL; + p->bufBase = NULL; p->directInput = 0; + p->stream = NULL; p->hash = NULL; p->expectedDataSize = (UInt64)(Int64)-1; MatchFinder_SetDefaultSettings(p); @@ -238,6 +249,8 @@ void MatchFinder_Construct(CMatchFinder *p) } } +#undef kCrcPoly + static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->hash); @@ -252,7 +265,7 @@ void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc) static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc) { - size_t sizeInBytes = (size_t)num * sizeof(CLzRef); + const size_t sizeInBytes = (size_t)num * sizeof(CLzRef); if (sizeInBytes / sizeof(CLzRef) != num) return NULL; return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes); @@ -298,6 +311,62 @@ static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize) } +// input is historySize +static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs) +{ + if (p->numHashBytes == 2) + return (1 << 16) - 1; + if (hs != 0) + hs--; + hs |= (hs >> 1); + hs |= (hs >> 2); + hs |= (hs >> 4); + hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later + if (hs >= (1 << 24)) + { + if (p->numHashBytes == 3) + hs = (1 << 24) - 1; + /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + } + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + return hs; +} + +// input is historySize +static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs) +{ + if (p->numHashBytes == 2) + return (1 << 16) - 1; + if (hs != 0) + hs--; + hs |= (hs >> 1); + hs |= (hs >> 2); + hs |= (hs >> 4); + hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later + hs >>= 1; + if (hs >= (1 << 24)) + { + if (p->numHashBytes == 3) + hs = (1 << 24) - 1; + else + hs >>= 1; + /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + } + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + return hs; +} + + int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc) @@ -318,78 +387,91 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, p->blockSize = 0; if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc)) { - const UInt32 newCyclicBufferSize = historySize + 1; // do not change it - UInt32 hs; - p->matchMaxLen = matchMaxLen; + size_t hashSizeSum; { - // UInt32 hs4; - p->fixedHashSize = 0; - hs = (1 << 16) - 1; - if (p->numHashBytes != 2) + UInt32 hs; + UInt32 hsCur; + + if (p->numHashOutBits != 0) { - hs = historySize; - if (hs > p->expectedDataSize) - hs = (UInt32)p->expectedDataSize; - if (hs != 0) - hs--; - hs |= (hs >> 1); - hs |= (hs >> 2); - hs |= (hs >> 4); - hs |= (hs >> 8); - // we propagated 16 bits in (hs). Low 16 bits must be set later - hs >>= 1; - if (hs >= (1 << 24)) - { - if (p->numHashBytes == 3) - hs = (1 << 24) - 1; - else - hs >>= 1; - /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ - } - - // hs = ((UInt32)1 << 25) - 1; // for test - + unsigned numBits = p->numHashOutBits; + const unsigned nbMax = + (p->numHashBytes == 2 ? 16 : + (p->numHashBytes == 3 ? 24 : 32)); + if (numBits > nbMax) + numBits = nbMax; + if (numBits >= 32) + hs = (UInt32)0 - 1; + else + hs = ((UInt32)1 << numBits) - 1; // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) hs |= (1 << 16) - 1; /* don't change it! */ - - // bt5: we adjust the size with recommended minimum size if (p->numHashBytes >= 5) hs |= (256 << kLzHash_CrcShift_2) - 1; + { + const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); + if (hs > hs2) + hs = hs2; + } + hsCur = hs; + if (p->expectedDataSize < historySize) + { + const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); + if (hsCur > hs2) + hsCur = hs2; + } } - p->hashMask = hs; - hs++; - - /* - hs4 = (1 << 20); - if (hs4 > hs) - hs4 = hs; - // hs4 = (1 << 16); // for test - p->hash4Mask = hs4 - 1; - */ + else + { + hs = MatchFinder_GetHashMask(p, historySize); + hsCur = hs; + if (p->expectedDataSize < historySize) + { + hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); + if (hsCur > hs) // is it possible? + hsCur = hs; + } + } + + p->hashMask = hsCur; - if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size; - if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size; - // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size; - hs += p->fixedHashSize; + hashSizeSum = hs; + hashSizeSum++; + if (hashSizeSum < hs) + return 0; + { + UInt32 fixedHashSize = 0; + if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size; + if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size; + // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size; + hashSizeSum += fixedHashSize; + p->fixedHashSize = fixedHashSize; + } } + p->matchMaxLen = matchMaxLen; + { size_t newSize; size_t numSons; + const UInt32 newCyclicBufferSize = historySize + 1; // do not change it p->historySize = historySize; - p->hashSizeSum = hs; p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1) numSons = newCyclicBufferSize; if (p->btMode) numSons <<= 1; - newSize = hs + numSons; + newSize = hashSizeSum + numSons; + + if (numSons < newCyclicBufferSize || newSize < numSons) + return 0; // aligned size is not required here, but it can be better for some loops #define NUM_REFS_ALIGN_MASK 0xF newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK; - if (p->hash && p->numRefs == newSize) + // 22.02: we don't reallocate buffer, if old size is enough + if (p->hash && p->numRefs >= newSize) return 1; MatchFinder_FreeThisClassMemory(p, alloc); @@ -398,7 +480,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, if (p->hash) { - p->son = p->hash + p->hashSizeSum; + p->son = p->hash + hashSizeSum; return 1; } } @@ -470,7 +552,8 @@ void MatchFinder_Init_HighHash(CMatchFinder *p) void MatchFinder_Init_4(CMatchFinder *p) { - p->buffer = p->bufferBase; + if (!p->directInput) + p->buffer = p->bufBase; { /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker. the code in CMatchFinderMt expects (pos = 1) */ @@ -507,20 +590,20 @@ void MatchFinder_Init(CMatchFinder *p) #ifdef MY_CPU_X86_OR_AMD64 - #if defined(__clang__) && (__clang_major__ >= 8) \ - || defined(__GNUC__) && (__GNUC__ >= 8) \ - || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) - #define USE_SATUR_SUB_128 - #define USE_AVX2 - #define ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) - #define ATTRIB_AVX2 __attribute__((__target__("avx2"))) + #if defined(__clang__) && (__clang_major__ >= 4) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701) + // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) + + #define USE_LZFIND_SATUR_SUB_128 + #define USE_LZFIND_SATUR_SUB_256 + #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) + #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2"))) #elif defined(_MSC_VER) #if (_MSC_VER >= 1600) - #define USE_SATUR_SUB_128 - #if (_MSC_VER >= 1900) - #define USE_AVX2 - #include <immintrin.h> // avx - #endif + #define USE_LZFIND_SATUR_SUB_128 + #endif + #if (_MSC_VER >= 1900) + #define USE_LZFIND_SATUR_SUB_256 #endif #endif @@ -529,16 +612,16 @@ void MatchFinder_Init(CMatchFinder *p) #if defined(__clang__) && (__clang_major__ >= 8) \ || defined(__GNUC__) && (__GNUC__ >= 8) - #define USE_SATUR_SUB_128 + #define USE_LZFIND_SATUR_SUB_128 #ifdef MY_CPU_ARM64 - // #define ATTRIB_SSE41 __attribute__((__target__(""))) + // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) #else - // #define ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) + // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) #endif #elif defined(_MSC_VER) #if (_MSC_VER >= 1910) - #define USE_SATUR_SUB_128 + #define USE_LZFIND_SATUR_SUB_128 #endif #endif @@ -550,121 +633,130 @@ void MatchFinder_Init(CMatchFinder *p) #endif -/* -#ifndef ATTRIB_SSE41 - #define ATTRIB_SSE41 -#endif -#ifndef ATTRIB_AVX2 - #define ATTRIB_AVX2 -#endif -*/ -#ifdef USE_SATUR_SUB_128 +#ifdef USE_LZFIND_SATUR_SUB_128 -// #define _SHOW_HW_STATUS +// #define Z7_SHOW_HW_STATUS -#ifdef _SHOW_HW_STATUS +#ifdef Z7_SHOW_HW_STATUS #include <stdio.h> -#define _PRF(x) x -_PRF(;) +#define PRF(x) x +PRF(;) #else -#define _PRF(x) +#define PRF(x) #endif + #ifdef MY_CPU_ARM_OR_ARM64 #ifdef MY_CPU_ARM64 -// #define FORCE_SATUR_SUB_128 +// #define FORCE_LZFIND_SATUR_SUB_128 #endif +typedef uint32x4_t LzFind_v128; +#define SASUB_128_V(v, s) \ + vsubq_u32(vmaxq_u32(v, s), s) -typedef uint32x4_t v128; -#define SASUB_128(i) \ - *(v128 *)(void *)(items + (i) * 4) = \ - vsubq_u32(vmaxq_u32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); - -#else +#else // MY_CPU_ARM_OR_ARM64 #include <smmintrin.h> // sse4.1 -typedef __m128i v128; -#define SASUB_128(i) \ - *(v128 *)(void *)(items + (i) * 4) = \ - _mm_sub_epi32(_mm_max_epu32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); // SSE 4.1 +typedef __m128i LzFind_v128; +// SSE 4.1 +#define SASUB_128_V(v, s) \ + _mm_sub_epi32(_mm_max_epu32(v, s), s) -#endif +#endif // MY_CPU_ARM_OR_ARM64 +#define SASUB_128(i) \ + *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \ + *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2); + -MY_NO_INLINE +Z7_NO_INLINE static -#ifdef ATTRIB_SSE41 -ATTRIB_SSE41 +#ifdef LZFIND_ATTRIB_SSE41 +LZFIND_ATTRIB_SSE41 #endif void -MY_FAST_CALL +Z7_FASTCALL LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - v128 sub2 = + const LzFind_v128 sub2 = #ifdef MY_CPU_ARM_OR_ARM64 vdupq_n_u32(subValue); #else _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); #endif + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE do { - SASUB_128(0) - SASUB_128(1) - SASUB_128(2) - SASUB_128(3) - items += 4 * 4; + SASUB_128(0) SASUB_128(1) items += 2 * 4; + SASUB_128(0) SASUB_128(1) items += 2 * 4; } while (items != lim); } -#ifdef USE_AVX2 +#ifdef USE_LZFIND_SATUR_SUB_256 #include <immintrin.h> // avx +/* +clang :immintrin.h uses +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX2__) +#include <avx2intrin.h> +#endif +so we need <avxintrin.h> for clang-cl */ -#define SASUB_256(i) *(__m256i *)(void *)(items + (i) * 8) = _mm256_sub_epi32(_mm256_max_epu32(*(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); // AVX2 +#if defined(__clang__) +#include <avxintrin.h> +#include <avx2intrin.h> +#endif -MY_NO_INLINE +// AVX2: +#define SASUB_256(i) \ + *( __m256i *)( void *)(items + (i) * 8) = \ + _mm256_sub_epi32(_mm256_max_epu32( \ + *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); + +Z7_NO_INLINE static -#ifdef ATTRIB_AVX2 -ATTRIB_AVX2 +#ifdef LZFIND_ATTRIB_AVX2 +LZFIND_ATTRIB_AVX2 #endif void -MY_FAST_CALL +Z7_FASTCALL LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - __m256i sub2 = _mm256_set_epi32( + const __m256i sub2 = _mm256_set_epi32( (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE do { - SASUB_256(0) - SASUB_256(1) - items += 2 * 8; + SASUB_256(0) SASUB_256(1) items += 2 * 8; + SASUB_256(0) SASUB_256(1) items += 2 * 8; } while (items != lim); } -#endif // USE_AVX2 +#endif // USE_LZFIND_SATUR_SUB_256 -#ifndef FORCE_SATUR_SUB_128 -typedef void (MY_FAST_CALL *LZFIND_SATUR_SUB_CODE_FUNC)( +#ifndef FORCE_LZFIND_SATUR_SUB_128 +typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)( UInt32 subValue, CLzRef *items, const CLzRef *lim); static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; -#endif // FORCE_SATUR_SUB_128 +#endif // FORCE_LZFIND_SATUR_SUB_128 -#endif // USE_SATUR_SUB_128 +#endif // USE_LZFIND_SATUR_SUB_128 // kEmptyHashValue must be zero -// #define SASUB_32(i) v = items[i]; m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; -#define SASUB_32(i) v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; +// #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; } +#define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; } -#ifdef FORCE_SATUR_SUB_128 +#ifdef FORCE_LZFIND_SATUR_SUB_128 #define DEFAULT_SaturSub LzFind_SaturSub_128 @@ -672,24 +764,19 @@ static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; #define DEFAULT_SaturSub LzFind_SaturSub_32 -MY_NO_INLINE +Z7_NO_INLINE static void -MY_FAST_CALL +Z7_FASTCALL LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) { + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE do { - UInt32 v; - SASUB_32(0) - SASUB_32(1) - SASUB_32(2) - SASUB_32(3) - SASUB_32(4) - SASUB_32(5) - SASUB_32(6) - SASUB_32(7) - items += 8; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; } while (items != lim); } @@ -697,27 +784,23 @@ LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) #endif -MY_NO_INLINE +Z7_NO_INLINE void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) { - #define K_NORM_ALIGN_BLOCK_SIZE (1 << 6) - - CLzRef *lim; - - for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (K_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) + #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7) + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) { - UInt32 v; - SASUB_32(0); + SASUB_32(0) items++; } - { - #define K_NORM_ALIGN_MASK (K_NORM_ALIGN_BLOCK_SIZE / 4 - 1) - lim = items + (numItems & ~(size_t)K_NORM_ALIGN_MASK); - numItems &= K_NORM_ALIGN_MASK; + const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1); + CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask); + numItems &= k_Align_Mask; if (items != lim) { - #if defined(USE_SATUR_SUB_128) && !defined(FORCE_SATUR_SUB_128) + #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128) if (g_LzFind_SaturSub) g_LzFind_SaturSub(subValue, items, lim); else @@ -726,12 +809,10 @@ void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) } items = lim; } - - + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE for (; numItems != 0; numItems--) { - UInt32 v; - SASUB_32(0); + SASUB_32(0) items++; } } @@ -740,7 +821,7 @@ void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) // call MatchFinder_CheckLimits() only after (p->pos++) update -MY_NO_INLINE +Z7_NO_INLINE static void MatchFinder_CheckLimits(CMatchFinder *p) { if (// !p->streamEndWasReached && p->result == SZ_OK && @@ -768,11 +849,14 @@ static void MatchFinder_CheckLimits(CMatchFinder *p) const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */; // const UInt32 subValue = (1 << 15); // for debug // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue); - size_t numSonRefs = p->cyclicBufferSize; - if (p->btMode) - numSonRefs <<= 1; - Inline_MatchFinder_ReduceOffsets(p, subValue); - MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashSizeSum + numSonRefs); + MatchFinder_REDUCE_OFFSETS(p, subValue) + MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize); + { + size_t numSonRefs = p->cyclicBufferSize; + if (p->btMode) + numSonRefs <<= 1; + MatchFinder_Normalize3(subValue, p->son, numSonRefs); + } } if (p->cyclicBufferPos == p->cyclicBufferSize) @@ -785,7 +869,7 @@ static void MatchFinder_CheckLimits(CMatchFinder *p) /* (lenLimit > maxLen) */ -MY_FORCE_INLINE +Z7_FORCE_INLINE static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, UInt32 *d, unsigned maxLen) @@ -867,7 +951,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, } -MY_FORCE_INLINE +Z7_FORCE_INLINE UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, UInt32 *d, UInt32 maxLen) @@ -1004,7 +1088,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const #define MOVE_POS_RET MOVE_POS return distances; -MY_NO_INLINE +Z7_NO_INLINE static void MatchFinder_MovePos(CMatchFinder *p) { /* we go here at the end of stream data, when (avail < num_hash_bytes) @@ -1015,11 +1099,11 @@ static void MatchFinder_MovePos(CMatchFinder *p) if (p->btMode) p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue */ - MOVE_POS; + MOVE_POS } #define GET_MATCHES_HEADER2(minLen, ret_op) \ - unsigned lenLimit; UInt32 hv; Byte *cur; UInt32 curMatch; \ + unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ cur = p->buffer; @@ -1028,11 +1112,11 @@ static void MatchFinder_MovePos(CMatchFinder *p) #define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue -#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS; } while (--num); +#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num); #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ distances = func(MF_PARAMS(p), \ - distances, (UInt32)_maxLen_); MOVE_POS_RET; + distances, (UInt32)_maxLen_); MOVE_POS_RET #define GET_MATCHES_FOOTER_BT(_maxLen_) \ GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) @@ -1052,7 +1136,7 @@ static void MatchFinder_MovePos(CMatchFinder *p) static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { GET_MATCHES_HEADER(2) - HASH2_CALC; + HASH2_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; GET_MATCHES_FOOTER_BT(1) @@ -1061,7 +1145,7 @@ static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { GET_MATCHES_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; GET_MATCHES_FOOTER_BT(2) @@ -1082,7 +1166,7 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) UInt32 *hash; GET_MATCHES_HEADER(3) - HASH3_CALC; + HASH3_CALC hash = p->hash; pos = p->pos; @@ -1107,7 +1191,7 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) if (maxLen == lenLimit) { SkipMatchesSpec(MF_PARAMS(p)); - MOVE_POS_RET; + MOVE_POS_RET } } @@ -1123,7 +1207,7 @@ static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) UInt32 *hash; GET_MATCHES_HEADER(4) - HASH4_CALC; + HASH4_CALC hash = p->hash; pos = p->pos; @@ -1190,7 +1274,7 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) UInt32 *hash; GET_MATCHES_HEADER(5) - HASH5_CALC; + HASH5_CALC hash = p->hash; pos = p->pos; @@ -1246,7 +1330,7 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) if (maxLen == lenLimit) { SkipMatchesSpec(MF_PARAMS(p)); - MOVE_POS_RET; + MOVE_POS_RET } break; } @@ -1263,7 +1347,7 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) UInt32 *hash; GET_MATCHES_HEADER(4) - HASH4_CALC; + HASH4_CALC hash = p->hash; pos = p->pos; @@ -1314,12 +1398,12 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS_RET; + MOVE_POS_RET } break; } - GET_MATCHES_FOOTER_HC(maxLen); + GET_MATCHES_FOOTER_HC(maxLen) } @@ -1330,7 +1414,7 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) UInt32 *hash; GET_MATCHES_HEADER(5) - HASH5_CALC; + HASH5_CALC hash = p->hash; pos = p->pos; @@ -1386,19 +1470,19 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS_RET; + MOVE_POS_RET } break; } - GET_MATCHES_FOOTER_HC(maxLen); + GET_MATCHES_FOOTER_HC(maxLen) } UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { GET_MATCHES_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; GET_MATCHES_FOOTER_HC(2) @@ -1409,7 +1493,7 @@ static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { SKIP_HEADER(2) { - HASH2_CALC; + HASH2_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; } @@ -1420,7 +1504,7 @@ void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { SKIP_HEADER(3) { - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; } @@ -1433,7 +1517,7 @@ static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { UInt32 h2; UInt32 *hash; - HASH3_CALC; + HASH3_CALC hash = p->hash; curMatch = (hash + kFix3HashSize)[hv]; hash[h2] = @@ -1448,7 +1532,7 @@ static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { UInt32 h2, h3; UInt32 *hash; - HASH4_CALC; + HASH4_CALC hash = p->hash; curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = @@ -1464,7 +1548,7 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { UInt32 h2, h3; UInt32 *hash; - HASH5_CALC; + HASH5_CALC hash = p->hash; curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = @@ -1478,7 +1562,7 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) #define HC_SKIP_HEADER(minLen) \ do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \ - Byte *cur; \ + const Byte *cur; \ UInt32 *hash; \ UInt32 *son; \ UInt32 pos = p->pos; \ @@ -1510,7 +1594,7 @@ static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) HC_SKIP_HEADER(4) UInt32 h2, h3; - HASH4_CALC; + HASH4_CALC curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = @@ -1540,7 +1624,7 @@ void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { HC_SKIP_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = hash[hv]; hash[hv] = pos; @@ -1590,17 +1674,17 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) -void LzFindPrepare() +void LzFindPrepare(void) { - #ifndef FORCE_SATUR_SUB_128 - #ifdef USE_SATUR_SUB_128 + #ifndef FORCE_LZFIND_SATUR_SUB_128 + #ifdef USE_LZFIND_SATUR_SUB_128 LZFIND_SATUR_SUB_CODE_FUNC f = NULL; #ifdef MY_CPU_ARM_OR_ARM64 { if (CPU_IsSupported_NEON()) { // #pragma message ("=== LzFind NEON") - _PRF(printf("\n=== LzFind NEON\n")); + PRF(printf("\n=== LzFind NEON\n")); f = LzFind_SaturSub_128; } // f = 0; // for debug @@ -1609,20 +1693,25 @@ void LzFindPrepare() if (CPU_IsSupported_SSE41()) { // #pragma message ("=== LzFind SSE41") - _PRF(printf("\n=== LzFind SSE41\n")); + PRF(printf("\n=== LzFind SSE41\n")); f = LzFind_SaturSub_128; - #ifdef USE_AVX2 + #ifdef USE_LZFIND_SATUR_SUB_256 if (CPU_IsSupported_AVX2()) { // #pragma message ("=== LzFind AVX2") - _PRF(printf("\n=== LzFind AVX2\n")); + PRF(printf("\n=== LzFind AVX2\n")); f = LzFind_SaturSub_256; } #endif } #endif // MY_CPU_ARM_OR_ARM64 g_LzFind_SaturSub = f; - #endif // USE_SATUR_SUB_128 - #endif // FORCE_SATUR_SUB_128 + #endif // USE_LZFIND_SATUR_SUB_128 + #endif // FORCE_LZFIND_SATUR_SUB_128 } + + +#undef MOVE_POS +#undef MOVE_POS_RET +#undef PRF |