From ba5da0946c3abaa93d1161ca512c3c326cda3736 Mon Sep 17 00:00:00 2001 From: Mounir IDRASSI Date: Fri, 8 Feb 2019 01:48:12 +0100 Subject: Windows: Add implementation of ChaCha20 based random generator. Use it for driver need of random bytes (currently only wipe bytes but more to come later). --- src/Crypto/Crypto.vcxproj | 7 ++ src/Crypto/Crypto.vcxproj.filters | 21 ++++ src/Crypto/Sources | 4 +- src/Crypto/chacha-xmm.c | 160 ++++++++++++++++++++++++++++ src/Crypto/chacha256.c | 219 ++++++++++++++++++++++++++++++++++++++ src/Crypto/chacha256.h | 31 ++++++ src/Crypto/chachaRng.c | 120 +++++++++++++++++++++ src/Crypto/chachaRng.h | 63 +++++++++++ src/Crypto/chacha_u1.h | 102 ++++++++++++++++++ src/Crypto/chacha_u4.h | 187 ++++++++++++++++++++++++++++++++ src/Crypto/config.h | 9 ++ src/Crypto/cpu.h | 10 +- 12 files changed, 929 insertions(+), 4 deletions(-) create mode 100644 src/Crypto/chacha-xmm.c create mode 100644 src/Crypto/chacha256.c create mode 100644 src/Crypto/chacha256.h create mode 100644 src/Crypto/chachaRng.c create mode 100644 src/Crypto/chachaRng.h create mode 100644 src/Crypto/chacha_u1.h create mode 100644 src/Crypto/chacha_u4.h (limited to 'src/Crypto') diff --git a/src/Crypto/Crypto.vcxproj b/src/Crypto/Crypto.vcxproj index 43ac766f..5fb52d97 100644 --- a/src/Crypto/Crypto.vcxproj +++ b/src/Crypto/Crypto.vcxproj @@ -215,6 +215,9 @@ + + + @@ -234,6 +237,10 @@ + + + + diff --git a/src/Crypto/Crypto.vcxproj.filters b/src/Crypto/Crypto.vcxproj.filters index c8d91b65..abf81655 100644 --- a/src/Crypto/Crypto.vcxproj.filters +++ b/src/Crypto/Crypto.vcxproj.filters @@ -60,6 +60,15 @@ Source Files + + Source Files + + + Source Files + + + Source Files + @@ -116,6 +125,18 @@ Header Files + + Header Files + + + Header Files + + + Header Files + + + Header Files + diff --git a/src/Crypto/Sources b/src/Crypto/Sources index 271edca6..5c44c371 100644 --- a/src/Crypto/Sources +++ b/src/Crypto/Sources @@ -6,7 +6,6 @@ INCLUDES = .. 
NTTARGETFILES = \ "$(OBJ_PATH)\$(O)\Aes_$(TC_ARCH).obj" \ "$(OBJ_PATH)\$(O)\Aes_hw_cpu.obj" \ - "$(OBJ_PATH)\$(O)\rdrand.obj" \ "$(OBJ_PATH)\$(O)\rdrand_ml.obj" \ "$(OBJ_PATH)\$(O)\gost89_$(TC_ARCH).obj" \ "$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).obj" \ @@ -28,6 +27,9 @@ SOURCES = \ rdrand_ml.asm \ Aeskey.c \ Aestab.c \ + chacha-xmm.c \ + chacha256.c \ + chachaRng.c \ cpu.c \ rdrand.c \ Rmd160.c \ diff --git a/src/Crypto/chacha-xmm.c b/src/Crypto/chacha-xmm.c new file mode 100644 index 00000000..198d0b5b --- /dev/null +++ b/src/Crypto/chacha-xmm.c @@ -0,0 +1,160 @@ +/* +chacha.c version $Date: 2014/09/08 17:38:05 $ +D. J. Bernstein +Romain Dolbeau +Public domain. +*/ + +// Modified by kerukuro for use in cppcrypto. + +/* Adapted to VeraCrypt */ + +#include "Common/Tcdefs.h" +#include "config.h" +#include "cpu.h" +#include "misc.h" + +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + +#ifndef _M_X64 +#ifdef _MSC_VER +#if _MSC_VER < 1900 +__inline __m128i _mm_set_epi64x(int64 i0, int64 i1) { + union { + int64 q[2]; + int32 r[4]; + } u; + u.q[0] = i1; u.q[1] = i0; + // this is inefficient, but other solutions are worse + return _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]); +} +#pragma warning(disable:4799) +__inline __m128i _mm_set1_epi64x(int64 a) +{ + union { + __m64 m; + long long ii; + } u; + u.ii = a; + return _mm_set1_epi64(u.m); +} +#pragma warning(default:4799) +#endif +#endif +#endif + +#define uint8 byte + +#define U32V(v) (v) +#define ROTL32(x,n) rotl32(x, n) +#define U32TO8_LITTLE(p, v) (((uint32*)(p))[0] = (v)) +#define U8TO32_LITTLE(v) *((uint32*)(v)) + + +#define ROTATE(v,c) (ROTL32(v,c)) +#define XOR(v,w) ((v) ^ (w)) +#define PLUS(v,w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) + +#define QUARTERROUND(a,b,c,d) \ + x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \ + x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \ + x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \ + x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 
7); + +static void salsa20_wordtobyte(uint8 output[64],const uint32 input[16], unsigned int r) +{ + uint32 x[16]; + int i; + + for (i = 0;i < 16;++i) x[i] = input[i]; + for (i = r;i > 0;--i) { + QUARTERROUND( 0, 4, 8,12) + QUARTERROUND( 1, 5, 9,13) + QUARTERROUND( 2, 6,10,14) + QUARTERROUND( 3, 7,11,15) + QUARTERROUND( 0, 5,10,15) + QUARTERROUND( 1, 6,11,12) + QUARTERROUND( 2, 7, 8,13) + QUARTERROUND( 3, 4, 9,14) + } + for (i = 0;i < 16;++i) x[i] = PLUS(x[i],input[i]); + for (i = 0;i < 16;++i) U32TO8_LITTLE(output + 4 * i,x[i]); +} + +void chacha_ECRYPT_init(void) +{ + return; +} + +static const char sigma[17] = "expand 32-byte k"; +static const char tau[17] = "expand 16-byte k"; + +void chacha_ECRYPT_keysetup(uint32* input,const uint8 *k,uint32 kbits,uint32 ivbits) +{ + const char *constants; + + input[4] = U8TO32_LITTLE(k + 0); + input[5] = U8TO32_LITTLE(k + 4); + input[6] = U8TO32_LITTLE(k + 8); + input[7] = U8TO32_LITTLE(k + 12); + if (kbits == 256) { /* recommended */ + k += 16; + constants = sigma; + } else { /* kbits == 128 */ + constants = tau; + } + input[8] = U8TO32_LITTLE(k + 0); + input[9] = U8TO32_LITTLE(k + 4); + input[10] = U8TO32_LITTLE(k + 8); + input[11] = U8TO32_LITTLE(k + 12); + input[0] = U8TO32_LITTLE(constants + 0); + input[1] = U8TO32_LITTLE(constants + 4); + input[2] = U8TO32_LITTLE(constants + 8); + input[3] = U8TO32_LITTLE(constants + 12); +} + +void chacha_ECRYPT_ivsetup(uint32* input,const uint8 *iv) +{ + input[12] = 0; + input[13] = 0; + input[14] = U8TO32_LITTLE(iv + 0); + input[15] = U8TO32_LITTLE(iv + 4); +} + +void chacha_ECRYPT_encrypt_bytes(size_t bytes, uint32* x, const uint8* m, uint8* out, uint8* output, unsigned int r) +{ + unsigned int i; + +#include "chacha_u4.h" + +#include "chacha_u1.h" + +#ifndef _M_X64 +#ifdef _MSC_VER +#if _MSC_VER < 1900 + _mm_empty(); +#endif +#endif +#endif + + if (!bytes) return; + for (;;) { + salsa20_wordtobyte(output,x, r); + x[12] = PLUSONE(x[12]); + if (!x[12]) { + x[13] = PLUSONE(x[13]); + /* 
stopping at 2^70 bytes per nonce is user's responsibility */ + } + if (bytes <= 64) { + for (i = 0;i < bytes;++i) out[i] = m[i] ^ output[i]; + return; + } + for (i = 0;i < 64;++i) out[i] = m[i] ^ output[i]; + bytes -= 64; + out += 64; + m += 64; + } +} + +#endif diff --git a/src/Crypto/chacha256.c b/src/Crypto/chacha256.c new file mode 100644 index 00000000..f32e607b --- /dev/null +++ b/src/Crypto/chacha256.c @@ -0,0 +1,219 @@ +/* +This code is written by kerukuro for cppcrypto library (http://cppcrypto.sourceforge.net/) +and released into public domain. +*/ + +/* adapted for VeraCrypt */ + +#include "chacha256.h" +#include "cpu.h" +#include "misc.h" + + + +#define rotater32(x,n) rotr32(x, n) +#define rotatel32(x,n) rotl32(x, n) + +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +void chacha_ECRYPT_encrypt_bytes(size_t bytes, uint32* x, const unsigned char* m, unsigned char* out, unsigned char* output, unsigned int r); +#endif + +static VC_INLINE void xor_block_512(const unsigned char* in, const unsigned char* prev, unsigned char* out) +{ +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && !defined(_UEFI) && (!defined (TC_WINDOWS_DRIVER) || (!defined (DEBUG) && defined (_WIN64))) + if (HasSSE2()) + { + __m128i b1 = _mm_loadu_si128((const __m128i*) in); + __m128i p1 = _mm_loadu_si128((const __m128i*) prev); + __m128i b2 = _mm_loadu_si128((const __m128i*) (in + 16)); + __m128i p2 = _mm_loadu_si128((const __m128i*) (prev + 16)); + + _mm_storeu_si128((__m128i*) out, _mm_xor_si128(b1, p1)); + _mm_storeu_si128((__m128i*) (out + 16), _mm_xor_si128(b2, p2)); + + b1 = _mm_loadu_si128((const __m128i*) (in + 32)); + p1 = _mm_loadu_si128((const __m128i*) (prev + 32)); + b2 = _mm_loadu_si128((const __m128i*) (in + 48)); + p2 = _mm_loadu_si128((const __m128i*) (prev + 48)); + + _mm_storeu_si128((__m128i*) (out + 32), _mm_xor_si128(b1, p1)); + _mm_storeu_si128((__m128i*) (out + 48), _mm_xor_si128(b2, p2)); + + } + else +#endif + { + int i; + for (i = 0; i < 64; i++) + out[i] = in[i] ^ 
prev[i]; + } + +} + +static VC_INLINE void chacha_core(uint32* x, int r) +{ + int i; + for (i = 0; i < r; i++) + { + x[0] += x[4]; + x[12] = rotatel32(x[12] ^ x[0], 16); + x[8] += x[12]; + x[4] = rotatel32(x[4] ^ x[8], 12); + x[0] += x[4]; + x[12] = rotatel32(x[12] ^ x[0], 8); + x[8] += x[12]; + x[4] = rotatel32(x[4] ^ x[8], 7); + + x[1] += x[5]; + x[13] = rotatel32(x[13] ^ x[1], 16); + x[9] += x[13]; + x[5] = rotatel32(x[5] ^ x[9], 12); + x[1] += x[5]; + x[13] = rotatel32(x[13] ^ x[1], 8); + x[9] += x[13]; + x[5] = rotatel32(x[5] ^ x[9], 7); + + x[2] += x[6]; + x[14] = rotatel32(x[14] ^ x[2], 16); + x[10] += x[14]; + x[6] = rotatel32(x[6] ^ x[10], 12); + x[2] += x[6]; + x[14] = rotatel32(x[14] ^ x[2], 8); + x[10] += x[14]; + x[6] = rotatel32(x[6] ^ x[10], 7); + + x[3] += x[7]; + x[15] = rotatel32(x[15] ^ x[3], 16); + x[11] += x[15]; + x[7] = rotatel32(x[7] ^ x[11], 12); + x[3] += x[7]; + x[15] = rotatel32(x[15] ^ x[3], 8); + x[11] += x[15]; + x[7] = rotatel32(x[7] ^ x[11], 7); + + x[0] += x[5]; + x[15] = rotatel32(x[15] ^ x[0], 16); + x[10] += x[15]; + x[5] = rotatel32(x[5] ^ x[10], 12); + x[0] += x[5]; + x[15] = rotatel32(x[15] ^ x[0], 8); + x[10] += x[15]; + x[5] = rotatel32(x[5] ^ x[10], 7); + + x[1] += x[6]; + x[12] = rotatel32(x[12] ^ x[1], 16); + x[11] += x[12]; + x[6] = rotatel32(x[6] ^ x[11], 12); + x[1] += x[6]; + x[12] = rotatel32(x[12] ^ x[1], 8); + x[11] += x[12]; + x[6] = rotatel32(x[6] ^ x[11], 7); + + x[2] += x[7]; + x[13] = rotatel32(x[13] ^ x[2], 16); + x[8] += x[13]; + x[7] = rotatel32(x[7] ^ x[8], 12); + x[2] += x[7]; + x[13] = rotatel32(x[13] ^ x[2], 8); + x[8] += x[13]; + x[7] = rotatel32(x[7] ^ x[8], 7); + + x[3] += x[4]; + x[14] = rotatel32(x[14] ^ x[3], 16); + x[9] += x[14]; + x[4] = rotatel32(x[4] ^ x[9], 12); + x[3] += x[4]; + x[14] = rotatel32(x[14] ^ x[3], 8); + x[9] += x[14]; + x[4] = rotatel32(x[4] ^ x[9], 7); + } +} + +static VC_INLINE void chacha_hash(const uint32* in, uint32* out, int r) +{ + uint32 x[16]; + int i; + memcpy(x, in, 
64); + chacha_core(x, r); + for (i = 0; i < 16; ++i) + out[i] = x[i] + in[i]; +} + +static VC_INLINE void incrementSalsaCounter(uint32* input, uint32* block, int r) +{ + chacha_hash(input, block, r); + if (!++input[12]) + ++input[13]; +} + +static VC_INLINE void do_encrypt(const unsigned char* in, size_t len, unsigned char* out, int r, size_t* posPtr, uint32* input, uint32* block) +{ + size_t i = 0, pos = *posPtr; + if (pos) + { + while (pos < len && pos < 64) + { + out[i] = in[i] ^ ((unsigned char*)block)[pos++]; + ++i; + } + len -= i; + } + if (len) + pos = 0; + +#if CRYPTOPP_SSSE3_AVAILABLE && !defined(_UEFI) && (!defined (TC_WINDOWS_DRIVER) || (!defined (DEBUG) && defined (_WIN64))) + if (HasSSSE3()) + { + size_t fullblocks = len - len % 64; + if (fullblocks) + { + chacha_ECRYPT_encrypt_bytes(fullblocks, input, in + i, out + i, (unsigned char*)block, r); + i += fullblocks; + len -= fullblocks; + } + if (len) + { + chacha_ECRYPT_encrypt_bytes(len, input, in + i, out + i, (unsigned char*)block, r); + pos = len; + } + *posPtr = pos; + return; + } +#endif + + for (; len; len -= VC_MIN(64, len)) + { + incrementSalsaCounter(input, block, r); + if (len >= 64) + { + xor_block_512(in + i, (unsigned char*)block, out + i); + i += 64; + } + else + { + for (; pos < len; pos++, i++) + out[i] = in[i] ^ ((unsigned char*)block)[pos]; + } + } + *posPtr = pos; +} + +void ChaCha256Init(ChaCha256Ctx* ctx, const unsigned char* key, const unsigned char* iv, int rounds) +{ + ctx->internalRounds = rounds / 2; + ctx->pos = 0; + + ctx->input_[12] = 0; + ctx->input_[13] = 0; + memcpy(ctx->input_ + 4, key, 32); + memcpy(ctx->input_ + 14, iv, 8); + ctx->input_[0] = 0x61707865; + ctx->input_[1] = 0x3320646E; + ctx->input_[2] = 0x79622D32; + ctx->input_[3] = 0x6B206574; +} + +void ChaCha256Encrypt(ChaCha256Ctx* ctx, const unsigned char* in, size_t len, unsigned char* out) +{ + do_encrypt(in, len, out, ctx->internalRounds, &ctx->pos, ctx->input_, ctx->block_); +} diff --git 
a/src/Crypto/chacha256.h b/src/Crypto/chacha256.h new file mode 100644 index 00000000..e9533a39 --- /dev/null +++ b/src/Crypto/chacha256.h @@ -0,0 +1,31 @@ +#ifndef HEADER_Crypto_ChaCha256 +#define HEADER_Crypto_ChaCha256 + +#include "Common/Tcdefs.h" +#include "config.h" + +typedef struct +{ + CRYPTOPP_ALIGN_DATA(16) uint32 block_[16]; + CRYPTOPP_ALIGN_DATA(16) uint32 input_[16]; + size_t pos; + int internalRounds; +} ChaCha256Ctx; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * key must be 32 bytes long and iv must be 8 bytes long + */ +void ChaCha256Init(ChaCha256Ctx* ctx, const unsigned char* key, const unsigned char* iv, int rounds); +void ChaCha256Encrypt(ChaCha256Ctx* ctx, const unsigned char* in, size_t len, unsigned char* out); +#define ChaCha256Decrypt ChaCha256Encrypt + +#ifdef __cplusplus +} +#endif + +#endif // HEADER_Crypto_ChaCha + diff --git a/src/Crypto/chachaRng.c b/src/Crypto/chachaRng.c new file mode 100644 index 00000000..b3d92039 --- /dev/null +++ b/src/Crypto/chachaRng.c @@ -0,0 +1,120 @@ +/* $OpenBSD: arc4random.c,v 1.54 2015/09/13 08:31:47 guenther Exp $ */ + +/* + * Copyright (c) 1996, David Mazieres + * Copyright (c) 2008, Damien Miller + * Copyright (c) 2013, Markus Friedl + * Copyright (c) 2014, Theo de Raadt + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * ChaCha based random number generator for OpenBSD. + */ + +/* + * Adapted for VeraCrypt + */ + +#include "chachaRng.h" +#include "cpu.h" +#include "misc.h" +#include <string.h> + +static VC_INLINE void ChaCha20RngReKey (ChaCha20RngCtx* pCtx, int useCallBack) +{ + /* fill rs_buf with the keystream */ + if (pCtx->m_rs_have) + memset(pCtx->m_rs_buf + sizeof(pCtx->m_rs_buf) - pCtx->m_rs_have, 0, pCtx->m_rs_have); + ChaCha256Encrypt(&pCtx->m_chachaCtx, pCtx->m_rs_buf, sizeof (pCtx->m_rs_buf), + pCtx->m_rs_buf); + /* mix in optional user provided data */ + if (pCtx->m_getRandSeedCallback && useCallBack) { + unsigned char dat[CHACHA20RNG_KEYSZ + CHACHA20RNG_IVSZ]; + size_t i; + + pCtx->m_getRandSeedCallback (dat, sizeof (dat)); + + for (i = 0; i < (CHACHA20RNG_KEYSZ + CHACHA20RNG_IVSZ); i++) + pCtx->m_rs_buf[i] ^= dat[i]; + + burn (dat, sizeof(dat)); + } + + /* immediately reinit for backtracking resistance */ + ChaCha256Init (&pCtx->m_chachaCtx, pCtx->m_rs_buf, pCtx->m_rs_buf + CHACHA20RNG_KEYSZ, 20); + memset(pCtx->m_rs_buf, 0, CHACHA20RNG_KEYSZ + CHACHA20RNG_IVSZ); + pCtx->m_rs_have = sizeof (pCtx->m_rs_buf) - CHACHA20RNG_KEYSZ - CHACHA20RNG_IVSZ; +} + +static VC_INLINE void ChaCha20RngStir(ChaCha20RngCtx* pCtx) +{ + ChaCha20RngReKey (pCtx, 1); + + /* invalidate rs_buf */ + pCtx->m_rs_have = 0; + memset(pCtx->m_rs_buf, 0, CHACHA20RNG_RSBUFSZ); + + pCtx->m_rs_count = 1600000; +} + +static VC_INLINE void ChaCha20RngStirIfNeeded(ChaCha20RngCtx* pCtx, size_t len) +{ + if (pCtx->m_rs_count <= len) { + ChaCha20RngStir(pCtx); + } else + pCtx->m_rs_count -= len; +} + +void ChaCha20RngInit (ChaCha20RngCtx* pCtx, const unsigned char* key, 
GetRandSeedFn rngSeedCallback, size_t InitialBytesToSkip) +{ + ChaCha256Init (&pCtx->m_chachaCtx, key, key + 32, 20); + pCtx->m_getRandSeedCallback = rngSeedCallback; + + /* fill rs_buf with the keystream */ + pCtx->m_rs_have = 0; + memset (pCtx->m_rs_buf, 0, sizeof (pCtx->m_rs_buf)); + pCtx->m_rs_count = 1600000; + + ChaCha20RngReKey(pCtx, 0); + + if (InitialBytesToSkip) + ChaCha20RngGetBytes (pCtx, NULL, InitialBytesToSkip); +} + +void ChaCha20RngGetBytes (ChaCha20RngCtx* pCtx, unsigned char* buffer, size_t bufferLen) +{ + unsigned char *buf = (unsigned char*) buffer; + unsigned char* keystream; + size_t m; + + ChaCha20RngStirIfNeeded(pCtx, bufferLen); + + while (bufferLen > 0) { + if (pCtx->m_rs_have > 0) { + m = VC_MIN(bufferLen, pCtx->m_rs_have); + keystream = pCtx->m_rs_buf + sizeof(pCtx->m_rs_buf) - pCtx->m_rs_have; + if (buf) + { + memcpy(buf, keystream, m); + buf += m; + } + memset(keystream, 0, m); + bufferLen -= m; + pCtx->m_rs_have -= m; + } + if (pCtx->m_rs_have == 0) + ChaCha20RngReKey (pCtx, 0); + } +} diff --git a/src/Crypto/chachaRng.h b/src/Crypto/chachaRng.h new file mode 100644 index 00000000..a2cb4ce8 --- /dev/null +++ b/src/Crypto/chachaRng.h @@ -0,0 +1,63 @@ +/* $OpenBSD: arc4random.c,v 1.54 2015/09/13 08:31:47 guenther Exp $ */ + +/* + * Copyright (c) 1996, David Mazieres + * Copyright (c) 2008, Damien Miller + * Copyright (c) 2013, Markus Friedl + * Copyright (c) 2014, Theo de Raadt + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * ChaCha based random number generator for OpenBSD. + */ + +/* + * Adapted for VeraCrypt + */ + +#ifndef HEADER_Crypto_ChaChaRng +#define HEADER_Crypto_ChaChaRng + +#include "chacha256.h" + +#define CHACHA20RNG_KEYSZ 32 +#define CHACHA20RNG_IVSZ 8 +#define CHACHA20RNG_BLOCKSZ 64 +#define CHACHA20RNG_RSBUFSZ (16*CHACHA20RNG_BLOCKSZ) + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*GetRandSeedFn)(unsigned char* pbRandSeed, size_t cbRandSeed); + +typedef struct +{ + ChaCha256Ctx m_chachaCtx; /* ChaCha20 context */ + unsigned char m_rs_buf[CHACHA20RNG_RSBUFSZ]; /* keystream blocks */ + size_t m_rs_have; /* valid bytes at end of rs_buf */ + size_t m_rs_count; /* bytes till reseed */ + GetRandSeedFn m_getRandSeedCallback; +} ChaCha20RngCtx; + +/* key length must be equal to 40 bytes (CHACHA20RNG_KEYSZ + CHACHA20RNG_IVSZ) */ +void ChaCha20RngInit (ChaCha20RngCtx* pCtx, const unsigned char* key, GetRandSeedFn rngCallback, size_t InitialBytesToSkip); +void ChaCha20RngGetBytes (ChaCha20RngCtx* pCtx, unsigned char* buffer, size_t bufferLen); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/Crypto/chacha_u1.h b/src/Crypto/chacha_u1.h new file mode 100644 index 00000000..e77bc1ea --- /dev/null +++ b/src/Crypto/chacha_u1.h @@ -0,0 +1,102 @@ +/* +u1.h version $Date: 2014/09/08 17:44:28 $ +D. J. Bernstein +Romain Dolbeau +Public domain. +*/ + +// Modified by kerukuro for use in cppcrypto. 
+ +// if (!bytes) return; + while (bytes >=64) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = _mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); + const __m128i rot8 = _mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3); + uint32 in12, in13; + + x_0 = _mm_load_si128((__m128i*)(x + 0)); + x_1 = _mm_load_si128((__m128i*)(x + 4)); + x_2 = _mm_load_si128((__m128i*)(x + 8)); + x_3 = _mm_load_si128((__m128i*)(x + 12)); + + for (i = 0 ; i < r ; ++i) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4))); + x_2 = 
_mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12))); + x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*)(m + 0))); + x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*)(m + 16))); + x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*)(m + 32))); + x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*)(m + 48))); + _mm_storeu_si128((__m128i*)(out + 0), x_0); + _mm_storeu_si128((__m128i*)(out + 16), x_1); + _mm_storeu_si128((__m128i*)(out + 32), x_2); + _mm_storeu_si128((__m128i*)(out + 48), x_3); + + in12 = x[12]; + in13 = x[13]; + in12 ++; + if (in12 == 0) + in13 ++; + x[12] = in12; + x[13] = in13; + + bytes -= 64; + out += 64; + m += 64; + } diff --git a/src/Crypto/chacha_u4.h b/src/Crypto/chacha_u4.h new file mode 100644 index 00000000..8eef5dc5 --- /dev/null +++ b/src/Crypto/chacha_u4.h @@ -0,0 +1,187 @@ +/* +u4.h version $Date: 2014/11/11 10:46:58 $ +D. J. Bernstein +Romain Dolbeau +Public domain. +*/ + +// Modified by kerukuro for use in cppcrypto. 
+ +#define VEC4_ROT(a,imm) _mm_or_si128(_mm_slli_epi32(a,imm),_mm_srli_epi32(a,(32-imm))) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & 16) (better) */ +#define VEC4_QUARTERROUND_SHUFFLE(a,b,c,d) \ + x_##a = _mm_add_epi32(x_##a, x_##b); t_##a = _mm_xor_si128(x_##d, x_##a); x_##d = _mm_shuffle_epi8(t_##a, rot16); \ + x_##c = _mm_add_epi32(x_##c, x_##d); t_##c = _mm_xor_si128(x_##b, x_##c); x_##b = VEC4_ROT(t_##c, 12); \ + x_##a = _mm_add_epi32(x_##a, x_##b); t_##a = _mm_xor_si128(x_##d, x_##a); x_##d = _mm_shuffle_epi8(t_##a, rot8); \ + x_##c = _mm_add_epi32(x_##c, x_##d); t_##c = _mm_xor_si128(x_##b, x_##c); x_##b = VEC4_ROT(t_##c, 7) + +#define VEC4_QUARTERROUND(a,b,c,d) VEC4_QUARTERROUND_SHUFFLE(a,b,c,d) + + +// if (!bytes) return; +if (bytes>=256) { + /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ + __m128i rot16 = _mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2); + __m128i rot8 = _mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3); + uint32 in12, in13; + __m128i x_0 = _mm_set1_epi32(x[0]); + __m128i x_1 = _mm_set1_epi32(x[1]); + __m128i x_2 = _mm_set1_epi32(x[2]); + __m128i x_3 = _mm_set1_epi32(x[3]); + __m128i x_4 = _mm_set1_epi32(x[4]); + __m128i x_5 = _mm_set1_epi32(x[5]); + __m128i x_6 = _mm_set1_epi32(x[6]); + __m128i x_7 = _mm_set1_epi32(x[7]); + __m128i x_8 = _mm_set1_epi32(x[8]); + __m128i x_9 = _mm_set1_epi32(x[9]); + __m128i x_10 = _mm_set1_epi32(x[10]); + __m128i x_11 = _mm_set1_epi32(x[11]); + __m128i x_12;// = _mm_set1_epi32(x[12]); /* useless */ + __m128i x_13;// = _mm_set1_epi32(x[13]); /* useless */ + __m128i x_14 = _mm_set1_epi32(x[14]); + __m128i x_15 = _mm_set1_epi32(x[15]); + __m128i orig0 = x_0; + __m128i orig1 = x_1; + __m128i orig2 = x_2; + __m128i orig3 = x_3; + __m128i orig4 = x_4; + __m128i orig5 = x_5; + __m128i orig6 = x_6; + __m128i orig7 = x_7; + __m128i orig8 = x_8; + __m128i orig9 = x_9; + __m128i orig10 = x_10; + __m128i orig11 = x_11; + __m128i orig12;// = x_12; /* 
useless */ + __m128i orig13;// = x_13; /* useless */ + __m128i orig14 = x_14; + __m128i orig15 = x_15; + __m128i t_0; + __m128i t_1; + __m128i t_2; + __m128i t_3; + __m128i t_4; + __m128i t_5; + __m128i t_6; + __m128i t_7; + __m128i t_8; + __m128i t_9; + __m128i t_10; + __m128i t_11; + __m128i t_12; + __m128i t_13; + __m128i t_14; + __m128i t_15; + + while (bytes >= 256) { + const __m128i addv12 = _mm_set_epi64x(1,0); + const __m128i addv13 = _mm_set_epi64x(3,2); + __m128i t12, t13; + uint64 in1213; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + //x_12 = orig12; /* useless */ + //x_13 = orig13; /* useless */ + x_14 = orig14; + x_15 = orig15; + + + + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64)in12) | (((uint64)in13) << 32); + t12 = _mm_set1_epi64x(in1213); + t13 = _mm_set1_epi64x(in1213); + + x_12 = _mm_add_epi64(addv12, t12); + x_13 = _mm_add_epi64(addv13, t13); + + t12 = _mm_unpacklo_epi32(x_12, x_13); + t13 = _mm_unpackhi_epi32(x_12, x_13); + + x_12 = _mm_unpacklo_epi32(t12, t13); + x_13 = _mm_unpackhi_epi32(t12, t13); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 4; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213>>32)&0xFFFFFFFF; + + for (i = 0 ; i < r ; ++i) { + VEC4_QUARTERROUND( 0, 4, 8,12); + VEC4_QUARTERROUND( 1, 5, 9,13); + VEC4_QUARTERROUND( 2, 6,10,14); + VEC4_QUARTERROUND( 3, 7,11,15); + VEC4_QUARTERROUND( 0, 5,10,15); + VEC4_QUARTERROUND( 1, 6,11,12); + VEC4_QUARTERROUND( 2, 7, 8,13); + VEC4_QUARTERROUND( 3, 4, 9,14); + } + +#define ONEQUAD_TRANSPOSE(a,b,c,d) \ + { \ + __m128i t0, t1, t2, t3; \ + x_##a = _mm_add_epi32(x_##a, orig##a); \ + x_##b = _mm_add_epi32(x_##b, orig##b); \ + x_##c = _mm_add_epi32(x_##c, orig##c); \ + x_##d = _mm_add_epi32(x_##d, orig##d); \ + t_##a = _mm_unpacklo_epi32(x_##a, x_##b); \ + t_##b = _mm_unpacklo_epi32(x_##c, x_##d); \ + t_##c = _mm_unpackhi_epi32(x_##a, x_##b); \ 
+ t_##d = _mm_unpackhi_epi32(x_##c, x_##d); \ + x_##a = _mm_unpacklo_epi64(t_##a, t_##b); \ + x_##b = _mm_unpackhi_epi64(t_##a, t_##b); \ + x_##c = _mm_unpacklo_epi64(t_##c, t_##d); \ + x_##d = _mm_unpackhi_epi64(t_##c, t_##d); \ + t0 = _mm_xor_si128(x_##a, _mm_loadu_si128((__m128i*)(m+0))); \ + _mm_storeu_si128((__m128i*)(out+0),t0); \ + t1 = _mm_xor_si128(x_##b, _mm_loadu_si128((__m128i*)(m+64))); \ + _mm_storeu_si128((__m128i*)(out+64),t1); \ + t2 = _mm_xor_si128(x_##c, _mm_loadu_si128((__m128i*)(m+128))); \ + _mm_storeu_si128((__m128i*)(out+128),t2); \ + t3 = _mm_xor_si128(x_##d, _mm_loadu_si128((__m128i*)(m+192))); \ + _mm_storeu_si128((__m128i*)(out+192),t3); \ + } + +#define ONEQUAD(a,b,c,d) ONEQUAD_TRANSPOSE(a,b,c,d) + + ONEQUAD(0,1,2,3); + m+=16; + out+=16; + ONEQUAD(4,5,6,7); + m+=16; + out+=16; + ONEQUAD(8,9,10,11); + m+=16; + out+=16; + ONEQUAD(12,13,14,15); + m-=48; + out-=48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE + + bytes -= 256; + out += 256; + m += 256; + } + } +#undef VEC4_ROT +#undef VEC4_QUARTERROUND +#undef VEC4_QUARTERROUND_SHUFFLE diff --git a/src/Crypto/config.h b/src/Crypto/config.h index 396be932..cf6f3dc3 100644 --- a/src/Crypto/config.h +++ b/src/Crypto/config.h @@ -119,6 +119,15 @@ #define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0 #endif +#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SSSE3) && ( \ + defined(__SSSE3__) || (_MSC_VER >= 1500) || \ + (CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1000) || (__SUNPRO_CC >= 0x5110) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 20300) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40000)) + #define CRYPTOPP_SSSE3_AVAILABLE 1 +#else + #define CRYPTOPP_SSSE3_AVAILABLE 0 +# endif + #if !defined(CRYPTOPP_DISABLE_SSSE3) && !defined(CRYPTOPP_DISABLE_AESNI) && CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && (CRYPTOPP_GCC_VERSION >= 40400 || _MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110 || defined(__AES__)) #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1 #else diff --git 
a/src/Crypto/cpu.h b/src/Crypto/cpu.h index 4dd5d88f..9fac453b 100644 --- a/src/Crypto/cpu.h +++ b/src/Crypto/cpu.h @@ -89,8 +89,10 @@ extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B); extern void _mm_store_si128(__m128i *_P, __m128i _B); extern __m64 _m_pxor(__m64 _MM1, __m64 _MM2); extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0); +extern __m128i _mm_set1_epi64(__m64 q); extern __m128i _mm_setr_epi32(int _I0, int _I1, int _I2, int _I3); extern __m128i _mm_loadu_si128(__m128i const*_P); +extern __m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0); extern __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0); extern __m128i _mm_set1_epi32(int _I); extern void _mm_storeu_si128(__m128i *_P, __m128i _B); @@ -99,6 +101,7 @@ extern __m128i _mm_slli_epi32(__m128i _A, int _Count); extern __m128i _mm_srli_epi32(__m128i _A, int _Count); extern __m128i _mm_add_epi32(__m128i _A, __m128i _B); extern __m128i _mm_sub_epi32(__m128i _A, __m128i _B); +extern __m128i _mm_add_epi64 (__m128i a, __m128i b); extern __m128i _mm_or_si128(__m128i _A, __m128i _B); extern __m128i _mm_and_si128(__m128i _A, __m128i _B); extern __m128i _mm_andnot_si128(__m128i _A, __m128i _B); @@ -109,6 +112,9 @@ extern __m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B); extern __m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B); extern __m128i _mm_srli_epi16(__m128i _A, int _Count); extern __m128i _mm_slli_epi16(__m128i _A, int _Count); +extern __m128i _mm_shuffle_epi32 (__m128i a, int imm8); +extern __m128i _mm_set_epi64x (__int64 e1, __int64 e0); +extern __m128i _mm_set1_epi64x (__int64 a); #define _mm_xor_si64 _m_pxor #define _mm_empty _m_empty #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \ @@ -122,8 +128,7 @@ extern __m128i _mm_slli_epi16(__m128i _A, int _Count); #endif #endif -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE -#if defined(__SSSE3__) || 
defined(__INTEL_COMPILER) +#if CRYPTOPP_SSSE3_AVAILABLE || defined(__INTEL_COMPILER) #if defined(TC_WINDOWS_DRIVER) || defined (_UEFI) #if defined(__cplusplus) extern "C" { @@ -135,7 +140,6 @@ extern __m128i _mm_shuffle_epi8 (__m128i a, __m128i b); #else #include <tmmintrin.h> #endif -#endif #if defined(__SSE4_1__) || defined(__INTEL_COMPILER) || defined(_MSC_VER) #if defined(TC_WINDOWS_DRIVER) || defined (_UEFI) -- cgit v1.2.3