VeraCrypt
path: root/src/Common/libzip/zip_crypto_openssl.c
/*
  zip_crypto_openssl.c -- OpenSSL wrapper.
  Copyright (C) 2018-2021 Dieter Baron and Thomas Klausner

  This file is part of libzip, a library to manipulate ZIP archives.
  The authors can be contacted at <info@libzip.org>

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.
  3. The names of the authors may not be used to endorse or promote
  products derived from this software without specific prior
  written permission.

  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdlib.h>

#include "zipint.h"

#include "zip_crypto.h"

#include <limits.h>
#include <openssl/rand.h>

#ifdef USE_OPENSSL_3_API
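/* OpenSSL 3 deprecates the HMAC_CTX_* interface in favour of the
 * provider-based EVP_MAC API, so the wrapper object allocated below keeps
 * the fetched EVP_MAC and its context together and frees them as a pair. */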
static _zip_crypto_hmac_t* hmac_new() {
    _zip_crypto_hmac_t *hmac = (_zip_crypto_hmac_t*)malloc(sizeof(*hmac));
    if (hmac != NULL) {
        hmac->mac = NULL;
        hmac->ctx = NULL;
    }
    return hmac;
}
static void hmac_free(_zip_crypto_hmac_t* hmac) {
    if (hmac != NULL) {
        if (hmac->ctx != NULL) {
            EVP_MAC_CTX_free(hmac->ctx);
        }
        if (hmac->mac != NULL) {
            EVP_MAC_free(hmac->mac);
        }
        free(hmac);
    }
}
#endif

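/* Create an AES context keyed for single-block ECB encryption; callers build
 * higher-level modes (e.g. the WinZip AES counter-mode keystream) on top of
 * _zip_crypto_aes_encrypt_block() below. key_size is given in bits. */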
_zip_crypto_aes_t *
_zip_crypto_aes_new(const zip_uint8_t *key, zip_uint16_t key_size, zip_error_t *error) {
    _zip_crypto_aes_t *aes;
    const EVP_CIPHER* cipher_type;

    switch (key_size) {
    case 128:
        cipher_type = EVP_aes_128_ecb();
        break;
    case 192:
        cipher_type = EVP_aes_192_ecb();
        break;
    case 256:
        cipher_type = EVP_aes_256_ecb();
        break;
    default:
        zip_error_set(error, ZIP_ER_INTERNAL, 0);
        return NULL;
    }

#ifdef USE_OPENSSL_1_0_API
    if ((aes = (_zip_crypto_aes_t *)malloc(sizeof(*aes))) == NULL) {
        zip_error_set(error, ZIP_ER_MEMORY, 0);
        return NULL;
    }
    memset(aes, 0, sizeof(*aes));
#else
    if ((aes = EVP_CIPHER_CTX_new()) == NULL) {
        zip_error_set(error, ZIP_ER_MEMORY, 0);
        return NULL;
    }
#endif

    if (EVP_EncryptInit_ex(aes, cipher_type, NULL, key, NULL) != 1) {
#ifdef USE_OPENSSL_1_0_API
        free(aes);
#else
        EVP_CIPHER_CTX_free(aes);
#endif
        zip_error_set(error, ZIP_ER_INTERNAL, 0);
        return NULL;
    }

    return aes;
}

void
_zip_crypto_aes_free(_zip_crypto_aes_t *aes) {
    if (aes == NULL) {
        return;
    }

#ifdef USE_OPENSSL_1_0_API
    EVP_CIPHER_CTX_cleanup(aes);
    _zip_crypto_clear(aes, sizeof(*aes));
    free(aes);
#else
    EVP_CIPHER_CTX_free(aes);
#endif
}


bool
_zip_crypto_aes_encrypt_block(_zip_crypto_aes_t *aes, const zip_uint8_t *in, zip_uint8_t *out) {
    int len;
    if (EVP_EncryptUpdate(aes, out, &len, in, ZIP_CRYPTO_AES_BLOCK_LENGTH) != 1) {
        return false;
    }
    return true;
}
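
/*
 * Illustrative only, not part of libzip: a minimal sketch of how the AES
 * wrapper above could be driven by a caller that has already derived a
 * 256-bit key. Kept inside #if 0 so it is never compiled.
 */
#if 0
static bool
example_encrypt_one_block(const zip_uint8_t key[32]) {
    zip_uint8_t in[ZIP_CRYPTO_AES_BLOCK_LENGTH] = {0}; /* e.g. a counter block */
    zip_uint8_t out[ZIP_CRYPTO_AES_BLOCK_LENGTH];
    zip_error_t error;
    _zip_crypto_aes_t *aes;
    bool ok;

    zip_error_init(&error);
    if ((aes = _zip_crypto_aes_new(key, 256, &error)) == NULL) {
        return false;
    }
    ok = _zip_crypto_aes_encrypt_block(aes, in, out); /* ECB-encrypt one 16-byte block */
    _zip_crypto_aes_free(aes);
    return ok;
}
#endif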


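/* Create an HMAC-SHA1 context keyed with `secret`; libzip uses it to
 * authenticate WinZip AES encrypted data. The length check guards the
 * conversion to the int expected by the underlying OpenSSL calls. */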
_zip_crypto_hmac_t *
_zip_crypto_hmac_new(const zip_uint8_t *secret, zip_uint64_t secret_length, zip_error_t *error) {
    _zip_crypto_hmac_t *hmac;

    if (secret_length > INT_MAX) {
        zip_error_set(error, ZIP_ER_INVAL, 0);
        return NULL;
    }

#ifdef USE_OPENSSL_3_API
    if ((hmac = hmac_new()) == NULL
        || (hmac->mac = EVP_MAC_fetch(NULL, "HMAC", "provider=default")) == NULL
        || (hmac->ctx = EVP_MAC_CTX_new(hmac->mac)) == NULL) {
        hmac_free(hmac);
        zip_error_set(error, ZIP_ER_MEMORY, 0);
        return NULL;
    }

    {
        OSSL_PARAM params[2];
        params[0] = OSSL_PARAM_construct_utf8_string("digest", "SHA1", 0);
        params[1] = OSSL_PARAM_construct_end();

        if (!EVP_MAC_init(hmac->ctx, (const unsigned char *)secret, secret_length, params)) {
            zip_error_set(error, ZIP_ER_INTERNAL, 0);
            hmac_free(hmac);
            return NULL;
        }
    }
#else
#ifdef USE_OPENSSL_1_0_API
    if ((hmac = (_zip_crypto_hmac_t *)malloc(sizeof(*hmac))) == NULL) {
        zip_error_set(error, ZIP_ER_MEMORY, 0);
        return NULL;
    }

    HMAC_CTX_init(hmac);
#else
    if ((hmac = HMAC_CTX_new()) == NULL) {
        zip_error_set(error, ZIP_ER_MEMORY, 0);
        return NULL;
    }
#endif

    if (HMAC_Init_ex(hmac, secret, (int)secret_length, EVP_sha1(), NULL) != 1) {
        zip_error_set(error, ZIP_ER_INTERNAL, 0);
#ifdef USE_OPENSSL_1_0_API
        free(hmac);
#else
        HMAC_CTX_free(hmac);
#endif
        return NULL;
    }
#endif

    return hmac;
}


void
_zip_crypto_hmac_free(_zip_crypto_hmac_t *hmac) {
    if (hmac == NULL) {
        return;
    }

#if defined(USE_OPENSSL_3_API)
    hmac_free(hmac);
#elif defined(USE_OPENSSL_1_0_API)
    HMAC_CTX_cleanup(hmac);
    _zip_crypto_clear(hmac, sizeof(*hmac));
    free(hmac);
#else
    HMAC_CTX_free(hmac);
#endif
}


bool
_zip_crypto_hmac_output(_zip_crypto_hmac_t *hmac, zip_uint8_t *data) {
#ifdef USE_OPENSSL_3_API
    size_t length;
    return EVP_MAC_final(hmac->ctx, data, &length, ZIP_CRYPTO_SHA1_LENGTH) == 1 && length == ZIP_CRYPTO_SHA1_LENGTH;
#else
    unsigned int length;
    return HMAC_Final(hmac, data, &length) == 1;
#endif
}
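
/*
 * Illustrative only, not part of libzip: a minimal sketch of the HMAC
 * lifecycle defined above. The per-chunk update step is provided by the
 * companion zip_crypto header and is only hinted at here. Kept inside
 * #if 0 so it is never compiled.
 */
#if 0
static bool
example_hmac_digest(const zip_uint8_t *secret, zip_uint64_t secret_length, zip_uint8_t out[ZIP_CRYPTO_SHA1_LENGTH]) {
    zip_error_t error;
    _zip_crypto_hmac_t *hmac;
    bool ok;

    zip_error_init(&error);
    if ((hmac = _zip_crypto_hmac_new(secret, secret_length, &error)) == NULL) {
        return false;
    }
    /* ... feed data chunks here with the update helper from the header ... */
    ok = _zip_crypto_hmac_output(hmac, out); /* writes ZIP_CRYPTO_SHA1_LENGTH bytes */
    _zip_crypto_hmac_free(hmac);
    return ok;
}
#endif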


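/* Fill `buffer` with cryptographically secure random bytes from OpenSSL's
 * CSPRNG (used e.g. for encryption salts); RAND_bytes() reports success as 1. */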
ZIP_EXTERN bool
zip_secure_random(zip_uint8_t *buffer, zip_uint16_t length) {
    return RAND_bytes(buffer, length) == 1;
}
}