From 546d6cff4447a56bbf7c0e1a8b6f89dba5d3183b Mon Sep 17 00:00:00 2001
From: Mounir IDRASSI
Date: Fri, 23 Jun 2017 02:07:32 +0200
Subject: Crypto: Add optimized SHA-512 and SHA-256 assembly implementations for x86_64 and x86. This improves speed by 30%.

---
 src/Common/Pkcs5.c                |   21 +
 src/Common/Tests.c                |    4 +-
 src/Crypto/Crypto.vcxproj         |   83 +++
 src/Crypto/Crypto.vcxproj.filters |   27 +
 src/Crypto/Makefile.inc           |   33 +-
 src/Crypto/Sha2.c                 | 1401 ++++++++++++++++++++-----------------
 src/Crypto/Sha2.h                 |  151 +---
 src/Crypto/Sources                |   21 +-
 src/Crypto/sha256-x64-nayuki.S    |    6 +
 src/Crypto/sha256-x86-nayuki.S    |  168 +++++
 src/Crypto/sha256_avx1_x64.asm    |  596 ++++++++++++++++
 src/Crypto/sha256_avx1_x86.asm    |   10 +
 src/Crypto/sha256_avx2_x64.asm    |  840 ++++++++++++++++++++++
 src/Crypto/sha256_avx2_x86.asm    |   10 +
 src/Crypto/sha256_sse4_x64.asm    |  560 +++++++++++++++
 src/Crypto/sha256_sse4_x86.asm    |   10 +
 src/Crypto/sha512-x64-nayuki.S    |  202 ++++++
 src/Crypto/sha512-x86-nayuki.S    |  180 +++++
 src/Crypto/sha512_avx1_x64.asm    |  427 +++++++++++
 src/Crypto/sha512_avx1_x86.asm    |   10 +
 src/Crypto/sha512_avx2_x64.asm    |  804 +++++++++++++++++++++
 src/Crypto/sha512_avx2_x86.asm    |   10 +
 src/Crypto/sha512_sse4_x64.asm    |  416 +++++++++++
 src/Crypto/sha512_sse4_x86.asm    |   10 +
 src/Driver/DriveFilter.c          |    4 +-
 src/Driver/Driver.vcxproj         |   27 +
 src/Driver/Driver.vcxproj.filters |   27 +
 src/Volume/Volume.make            |   44 ++
 28 files changed, 5313 insertions(+), 789 deletions(-)
 create mode 100644 src/Crypto/sha256-x64-nayuki.S
 create mode 100644 src/Crypto/sha256-x86-nayuki.S
 create mode 100644 src/Crypto/sha256_avx1_x64.asm
 create mode 100644 src/Crypto/sha256_avx1_x86.asm
 create mode 100644 src/Crypto/sha256_avx2_x64.asm
 create mode 100644 src/Crypto/sha256_avx2_x86.asm
 create mode 100644 src/Crypto/sha256_sse4_x64.asm
 create mode 100644 src/Crypto/sha256_sse4_x86.asm
 create mode 100644 src/Crypto/sha512-x64-nayuki.S
 create mode 100644 src/Crypto/sha512-x86-nayuki.S
 create mode 100644 src/Crypto/sha512_avx1_x64.asm
 create mode 100644 src/Crypto/sha512_avx1_x86.asm
 create mode 100644 src/Crypto/sha512_avx2_x64.asm
 create mode 100644 src/Crypto/sha512_avx2_x86.asm
 create mode 100644 src/Crypto/sha512_sse4_x64.asm
 create mode 100644 src/Crypto/sha512_sse4_x86.asm

diff --git a/src/Common/Pkcs5.c b/src/Common/Pkcs5.c
index 1da5e237..c33f1dab 100644
--- a/src/Common/Pkcs5.c
+++ b/src/Common/Pkcs5.c
@@ -327,6 +327,12 @@ void hmac_sha512
 	char* buf = hmac.k;
 	int b;
 	char key[SHA512_DIGESTSIZE];
+#if defined (DEVICE_DRIVER) && !defined (_WIN64)
+	KFLOATING_SAVE floatingPointState;
+	NTSTATUS saveStatus = STATUS_SUCCESS;
+	if (HasSSE2() && HasMMX())
+		saveStatus = KeSaveFloatingPointState (&floatingPointState);
+#endif

 	/* If the key is longer than the hash algorithm block size,
 	   let key = sha512(key), as per HMAC specifications.
*/ @@ -369,6 +375,11 @@ void hmac_sha512 hmac_sha512_internal (d, ld, &hmac); +#if defined (DEVICE_DRIVER) && !defined (_WIN64) + if (NT_SUCCESS (saveStatus) && (HasSSE2() && HasMMX())) + KeRestoreFloatingPointState (&floatingPointState); +#endif + /* Prevent leaks */ burn (&hmac, sizeof(hmac)); burn (key, sizeof(key)); @@ -408,6 +419,12 @@ void derive_key_sha512 (char *pwd, int pwd_len, char *salt, int salt_len, uint32 char* buf = hmac.k; int b, l, r; char key[SHA512_DIGESTSIZE]; +#if defined (DEVICE_DRIVER) && !defined (_WIN64) + KFLOATING_SAVE floatingPointState; + NTSTATUS saveStatus = STATUS_SUCCESS; + if (HasSSE2() && HasMMX()) + saveStatus = KeSaveFloatingPointState (&floatingPointState); +#endif /* If the password is longer than the hash algorithm block size, let pwd = sha512(pwd), as per HMAC specifications. */ @@ -471,6 +488,10 @@ void derive_key_sha512 (char *pwd, int pwd_len, char *salt, int salt_len, uint32 derive_u_sha512 (salt, salt_len, iterations, b, &hmac); memcpy (dk, hmac.u, r); +#if defined (DEVICE_DRIVER) && !defined (_WIN64) + if (NT_SUCCESS (saveStatus) && (HasSSE2() && HasMMX())) + KeRestoreFloatingPointState (&floatingPointState); +#endif /* Prevent possible leaks. */ burn (&hmac, sizeof(hmac)); diff --git a/src/Common/Tests.c b/src/Common/Tests.c index cf0c8699..c70954a6 100644 --- a/src/Common/Tests.c +++ b/src/Common/Tests.c @@ -584,7 +584,7 @@ BOOL RunHashTest (HashFunction fn, HashTestVector* vector, BOOL bUseSSE) #if defined (DEVICE_DRIVER) && !defined (_WIN64) KFLOATING_SAVE floatingPointState; NTSTATUS saveStatus = STATUS_SUCCESS; - if (bUseSSE && (HasSSE2() || HasSSE41())) + if (bUseSSE && (HasISSE() || HasSSE2())) saveStatus = KeSaveFloatingPointState (&floatingPointState); #endif while (vector[i].hexInput && vector[i].hexOutput) @@ -601,7 +601,7 @@ BOOL RunHashTest (HashFunction fn, HashTestVector* vector, BOOL bUseSSE) } #if defined (DEVICE_DRIVER) && !defined (_WIN64) - if (NT_SUCCESS (saveStatus) && bUseSSE && (HasSSE2() || HasSSE41())) + if (NT_SUCCESS (saveStatus) && bUseSSE && (HasISSE() || HasSSE2())) KeRestoreFloatingPointState (&floatingPointState); #endif diff --git a/src/Crypto/Crypto.vcxproj b/src/Crypto/Crypto.vcxproj index d7b686b1..c57f54d0 100644 --- a/src/Crypto/Crypto.vcxproj +++ b/src/Crypto/Crypto.vcxproj @@ -284,6 +284,89 @@ $(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & vsyasm.exe -Xvc -p gas -D WINABI -f win32 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + echo %(Filename)%(Extension) & vsyasm.exe -Xvc -p gas -D WINABI -f win32 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + $(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + 
$(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + $(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & vsyasm.exe -Xvc -p gas -D WINABI -f win32 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + echo %(Filename)%(Extension) & vsyasm.exe -Xvc -p gas -D WINABI -f win32 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & yasm.exe -Xvc -p gas -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + echo %(Filename)%(Extension) & yasm.exe -Xvc -p gas -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + $(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + $(TargetDir)\%(Filename).obj;%(Outputs) + + + true + true + Document + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + echo %(Filename)%(Extension) & yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)" + $(TargetDir)\%(Filename).obj;%(Outputs) + $(TargetDir)\%(Filename).obj;%(Outputs) + + diff --git a/src/Crypto/Crypto.vcxproj.filters b/src/Crypto/Crypto.vcxproj.filters index d94e0bc4..b0122300 100644 --- a/src/Crypto/Crypto.vcxproj.filters +++ b/src/Crypto/Crypto.vcxproj.filters @@ -130,5 +130,32 @@ Source Files + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + \ No newline at end of file diff --git a/src/Crypto/Makefile.inc b/src/Crypto/Makefile.inc index 9fecd39e..b1db4434 100644 --- a/src/Crypto/Makefile.inc +++ b/src/Crypto/Makefile.inc @@ -1,9 +1,9 @@ TC_ASFLAGS = -Xvc -Ox -VC_YASMFLAGS = -Xvc -p gas -D WINABI +VC_YASMFLAGS = -Xvc -D WINABI !if "$(TC_ARCH)" == "x86" TC_ASFLAGS = $(TC_ASFLAGS) -f win32 --prefix _ -D MS_STDCALL -D DLL_EXPORT -VC_YASMFLAGS = $(VC_YASMFLAGS) -f win32 +VC_YASMFLAGS = $(VC_YASMFLAGS) -f win32 -D MS_STDCALL !else TC_ASFLAGS = $(TC_ASFLAGS) -f win64 VC_YASMFLAGS = $(VC_YASMFLAGS) -f win64 @@ -21,11 +21,34 @@ TC_ASM_ERR_LOG = ..\Driver\build_errors_asm.log nasm.exe $(TC_ASFLAGS) -o "$@" -l 
"$(OBJ_PATH)\$(O)\Aes_hw_cpu.lst" Aes_hw_cpu.asm 2>$(TC_ASM_ERR_LOG) "$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).obj": Twofish_$(TC_ARCH).S - yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).lst" Twofish_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG) + yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).lst" Twofish_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG) "$(OBJ_PATH)\$(O)\Camellia_$(TC_ARCH).obj": Camellia_$(TC_ARCH).S - yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Camellia_$(TC_ARCH).lst" Camellia_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG) + yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\Camellia_$(TC_ARCH).lst" Camellia_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG) "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).obj": Camellia_aesni_$(TC_ARCH).S - yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).lst" Camellia_aesni_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG) + yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).lst" Camellia_aesni_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG) +"$(OBJ_PATH)\$(O)\sha256-$(TC_ARCH)-nayuki.obj": sha256-$(TC_ARCH)-nayuki.S + yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\sha256-$(TC_ARCH)-nayuki.lst" sha256-$(TC_ARCH)-nayuki.S 2>$(TC_ASM_ERR_LOG) + +"$(OBJ_PATH)\$(O)\sha512-$(TC_ARCH)-nayuki.obj": sha512-$(TC_ARCH)-nayuki.S + yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\sha512-$(TC_ARCH)-nayuki.lst" sha512-$(TC_ARCH)-nayuki.S 2>$(TC_ASM_ERR_LOG) + +"$(OBJ_PATH)\$(O)\sha512_avx1_$(TC_ARCH).obj": sha512_avx1_$(TC_ARCH).asm + yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha512_avx1_$(TC_ARCH).lst" sha512_avx1_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG) + +"$(OBJ_PATH)\$(O)\sha512_avx2_$(TC_ARCH).obj": sha512_avx2_$(TC_ARCH).asm + yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha512_avx2_$(TC_ARCH).lst" sha512_avx2_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG) + +"$(OBJ_PATH)\$(O)\sha512_sse4_$(TC_ARCH).obj": sha512_sse4_$(TC_ARCH).asm + yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha512_sse4_$(TC_ARCH).lst" sha512_sse4_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG) + +"$(OBJ_PATH)\$(O)\sha256_avx1_$(TC_ARCH).obj": sha256_avx1_$(TC_ARCH).asm + yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha256_avx1_$(TC_ARCH).lst" sha256_avx1_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG) + +"$(OBJ_PATH)\$(O)\sha256_avx2_$(TC_ARCH).obj": sha256_avx2_$(TC_ARCH).asm + yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha256_avx2_$(TC_ARCH).lst" sha256_avx2_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG) + +"$(OBJ_PATH)\$(O)\sha256_sse4_$(TC_ARCH).obj": sha256_sse4_$(TC_ARCH).asm + yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha256_sse4_$(TC_ARCH).lst" sha256_sse4_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG) diff --git a/src/Crypto/Sha2.c b/src/Crypto/Sha2.c index 9dbb529f..05da532e 100644 --- a/src/Crypto/Sha2.c +++ b/src/Crypto/Sha2.c @@ -1,767 +1,860 @@ /* - --------------------------------------------------------------------------- - Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software is allowed (with or without - changes) provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. 
the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 01/08/2005 - - This is a byte oriented version of SHA2 that operates on arrays of bytes - stored in memory. This code implements sha256, sha384 and sha512 but the - latter two functions rely on efficient 64-bit integer operations that - may not be very efficient on 32-bit machines - - The sha256 functions use a type 'sha256_ctx' to hold details of the - current hash state and uses the following three calls: - - void sha256_begin(sha256_ctx ctx[1]) - void sha256_hash(const unsigned char data[], - unsigned long len, sha256_ctx ctx[1]) - void sha_end1(unsigned char hval[], sha256_ctx ctx[1]) - - The first subroutine initialises a hash computation by setting up the - context in the sha256_ctx context. The second subroutine hashes 8-bit - bytes from array data[] into the hash state withinh sha256_ctx context, - the number of bytes to be hashed being given by the the unsigned long - integer len. The third subroutine completes the hash calculation and - places the resulting digest value in the array of 8-bit bytes hval[]. - - The sha384 and sha512 functions are similar and use the interfaces: - - void sha384_begin(sha384_ctx ctx[1]); - void sha384_hash(const unsigned char data[], - unsigned long len, sha384_ctx ctx[1]); - void sha384_end(unsigned char hval[], sha384_ctx ctx[1]); - - void sha512_begin(sha512_ctx ctx[1]); - void sha512_hash(const unsigned char data[], - unsigned long len, sha512_ctx ctx[1]); - void sha512_end(unsigned char hval[], sha512_ctx ctx[1]); - - In addition there is a function sha2 that can be used to call all these - functions using a call with a hash length parameter as follows: - - int sha2_begin(unsigned long len, sha2_ctx ctx[1]); - void sha2_hash(const unsigned char data[], - unsigned long len, sha2_ctx ctx[1]); - void sha2_end(unsigned char hval[], sha2_ctx ctx[1]); - - My thanks to Erik Andersen for testing this code - on big-endian systems and for his assistance with corrections +This code is written by kerukuro for cppcrypto library (http://cppcrypto.sourceforge.net/) +and released into public domain. */ +/* Modified for VeraCrypt with speed optimization for C implementation */ + +#include "Sha2.h" #include "Common/Endian.h" -#include "Common/Tcdefs.h" +#include "Crypto/cpu.h" #include "Crypto/misc.h" -#define PLATFORM_BYTE_ORDER BYTE_ORDER -#define IS_LITTLE_ENDIAN LITTLE_ENDIAN - -#if 0 -#define UNROLL_SHA2 /* for SHA2 loop unroll */ +#ifdef _UEFI +#define NO_OPTIMIZED_VERSIONS #endif -#if !defined(_UEFI) -#include /* for memcpy() etc. 
*/ -#endif // !defined(_UEFI) - -#include "Sha2.h" +#ifndef NO_OPTIMIZED_VERSIONS #if defined(__cplusplus) extern "C" { #endif - -#if defined( _MSC_VER ) && ( _MSC_VER > 800 ) && !defined(_UEFI) -#pragma intrinsic(memcpy) +#if CRYPTOPP_BOOL_X64 + void sha512_rorx(const void* M, void* D, uint_64t l); + void sha512_sse4(const void* M, uint_64t D[8], uint_64t l); + void sha512_avx(const void* M, void* D, uint_64t l); #endif - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define SWAP_BYTES -#else -#undef SWAP_BYTES + +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 + void sha512_compress_nayuki(uint_64t state[8], const uint_8t block[128]); #endif - -#if 0 - -#define ch(x,y,z) (((x) & (y)) ^ (~(x) & (z))) -#define maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - -#else /* Thanks to Rich Schroeppel and Colin Plumb for the following */ - -#define ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) -#define maj(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y)))) - +#if defined(__cplusplus) +} #endif -/* round transforms for SHA256 and SHA512 compression functions */ - -#define vf(n,i) v[(n - i) & 7] - -#define hf(i) (p[i & 15] += \ - g_1(p[(i + 14) & 15]) + p[(i + 9) & 15] + g_0(p[(i + 1) & 15])) - -#define v_cycle(i,j) \ - vf(7,i) += (j ? hf(i) : p[i]) + k_0[i+j] \ - + s_1(vf(4,i)) + ch(vf(4,i),vf(5,i),vf(6,i)); \ - vf(3,i) += vf(7,i); \ - vf(7,i) += s_0(vf(0,i))+ maj(vf(0,i),vf(1,i),vf(2,i)) - -#if defined(SHA_224) || defined(SHA_256) - -#define SHA256_MASK (SHA256_BLOCK_SIZE - 1) +#endif -#if defined(SWAP_BYTES) -#define bsw_32(p,n) \ - { int _i = (n); while(_i--) ((uint_32t*)p)[_i] = bswap_32(((uint_32t*)p)[_i]); } -#else -#define bsw_32(p,n) -#endif - -#define s_0(x) (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22)) -#define s_1(x) (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25)) -#define g_0(x) (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3)) -#define g_1(x) (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10)) -#define k_0 k256 - -/* rotated SHA256 round definition. 
Rather than swapping variables as in */ -/* FIPS-180, different variables are 'rotated' on each round, returning */ -/* to their starting positions every eight rounds */ - -#define q(n) v##n - -#define one_cycle(a,b,c,d,e,f,g,h,k,w) \ - q(h) += s_1(q(e)) + ch(q(e), q(f), q(g)) + k + w; \ - q(d) += q(h); q(h) += s_0(q(a)) + maj(q(a), q(b), q(c)) - -/* SHA256 mixing data */ - -const uint_32t k256[64] = -{ 0x428a2f98ul, 0x71374491ul, 0xb5c0fbcful, 0xe9b5dba5ul, - 0x3956c25bul, 0x59f111f1ul, 0x923f82a4ul, 0xab1c5ed5ul, - 0xd807aa98ul, 0x12835b01ul, 0x243185beul, 0x550c7dc3ul, - 0x72be5d74ul, 0x80deb1feul, 0x9bdc06a7ul, 0xc19bf174ul, - 0xe49b69c1ul, 0xefbe4786ul, 0x0fc19dc6ul, 0x240ca1ccul, - 0x2de92c6ful, 0x4a7484aaul, 0x5cb0a9dcul, 0x76f988daul, - 0x983e5152ul, 0xa831c66dul, 0xb00327c8ul, 0xbf597fc7ul, - 0xc6e00bf3ul, 0xd5a79147ul, 0x06ca6351ul, 0x14292967ul, - 0x27b70a85ul, 0x2e1b2138ul, 0x4d2c6dfcul, 0x53380d13ul, - 0x650a7354ul, 0x766a0abbul, 0x81c2c92eul, 0x92722c85ul, - 0xa2bfe8a1ul, 0xa81a664bul, 0xc24b8b70ul, 0xc76c51a3ul, - 0xd192e819ul, 0xd6990624ul, 0xf40e3585ul, 0x106aa070ul, - 0x19a4c116ul, 0x1e376c08ul, 0x2748774cul, 0x34b0bcb5ul, - 0x391c0cb3ul, 0x4ed8aa4aul, 0x5b9cca4ful, 0x682e6ff3ul, - 0x748f82eeul, 0x78a5636ful, 0x84c87814ul, 0x8cc70208ul, - 0x90befffaul, 0xa4506cebul, 0xbef9a3f7ul, 0xc67178f2ul, +typedef void (*transformFn)(sha512_ctx* ctx, void* m, uint_64t num_blks); + +transformFn transfunc = NULL; + +static const uint_64t K[80] = { + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 }; -/* Compile 64 bytes of hash data into SHA256 digest value */ -/* NOTE: this routine assumes that the byte order in the */ -/* ctx->wbuf[] at this point is such that low address bytes */ -/* in the ORIGINAL byte stream will go into the high end of */ -/* words on BOTH big and little endian systems */ -VOID_RETURN sha256_compile(sha256_ctx ctx[1]) +#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x,y,z) (((x) & (y)) | ((z) 
& ((x) ^ (y)))) +#define sum0(x) (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39)) +#define sum1(x) (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41)) +#define sigma0(x) (rotr64((x), 1) ^ rotr64((x), 8) ^ ((x) >> 7)) +#define sigma1(x) (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6)) + +#define WU(j) (W[j & 15] += sigma1(W[(j + 14) & 15]) + W[(j + 9) & 15] + sigma0(W[(j + 1) & 15])) + +#define COMPRESS_ROUND(i, j, K) \ + T1 = h + sum1(e) + Ch(e, f, g) + K[i + j] + (i? WU(j): W[j]); \ + T2 = sum0(a) + Maj(a, b, c); \ + h = g; \ + g = f; \ + f = e; \ + e = d + T1; \ + d = c; \ + c = b; \ + b = a; \ + a = T1 + T2; + +void StdTransform(sha512_ctx* ctx, void* mp, uint_64t num_blks) { -#if !defined(UNROLL_SHA2) - - uint_32t j, *p = ctx->wbuf, v[8]; - - memcpy(v, ctx->hash, 8 * sizeof(uint_32t)); - - for(j = 0; j < 64; j += 16) - { - v_cycle( 0, j); v_cycle( 1, j); - v_cycle( 2, j); v_cycle( 3, j); - v_cycle( 4, j); v_cycle( 5, j); - v_cycle( 6, j); v_cycle( 7, j); - v_cycle( 8, j); v_cycle( 9, j); - v_cycle(10, j); v_cycle(11, j); - v_cycle(12, j); v_cycle(13, j); - v_cycle(14, j); v_cycle(15, j); - } - - ctx->hash[0] += v[0]; ctx->hash[1] += v[1]; - ctx->hash[2] += v[2]; ctx->hash[3] += v[3]; - ctx->hash[4] += v[4]; ctx->hash[5] += v[5]; - ctx->hash[6] += v[6]; ctx->hash[7] += v[7]; + uint_64t blk; + for (blk = 0; blk < num_blks; blk++) + { + uint_64t W[16]; + uint_64t a,b,c,d,e,f,g,h; + uint_64t T1, T2; + int i; +#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG) + int j; +#endif + for (i = 0; i < 128 / 8; i++) + { + W[i] = bswap_64((((const uint_64t*)(mp))[blk * 16 + i])); + } + + a = ctx->hash[0]; + b = ctx->hash[1]; + c = ctx->hash[2]; + d = ctx->hash[3]; + e = ctx->hash[4]; + f = ctx->hash[5]; + g = ctx->hash[6]; + h = ctx->hash[7]; + + for (i = 0; i <= 79; i+=16) + { +#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG) + for (j = 0; j < 16; j++) + { + COMPRESS_ROUND(i, j, K); + } #else - - uint_32t *p = ctx->wbuf,v0,v1,v2,v3,v4,v5,v6,v7; - - v0 = ctx->hash[0]; v1 = ctx->hash[1]; - v2 = ctx->hash[2]; v3 = ctx->hash[3]; - v4 = ctx->hash[4]; v5 = ctx->hash[5]; - v6 = ctx->hash[6]; v7 = ctx->hash[7]; - - one_cycle(0,1,2,3,4,5,6,7,k256[ 0],p[ 0]); - one_cycle(7,0,1,2,3,4,5,6,k256[ 1],p[ 1]); - one_cycle(6,7,0,1,2,3,4,5,k256[ 2],p[ 2]); - one_cycle(5,6,7,0,1,2,3,4,k256[ 3],p[ 3]); - one_cycle(4,5,6,7,0,1,2,3,k256[ 4],p[ 4]); - one_cycle(3,4,5,6,7,0,1,2,k256[ 5],p[ 5]); - one_cycle(2,3,4,5,6,7,0,1,k256[ 6],p[ 6]); - one_cycle(1,2,3,4,5,6,7,0,k256[ 7],p[ 7]); - one_cycle(0,1,2,3,4,5,6,7,k256[ 8],p[ 8]); - one_cycle(7,0,1,2,3,4,5,6,k256[ 9],p[ 9]); - one_cycle(6,7,0,1,2,3,4,5,k256[10],p[10]); - one_cycle(5,6,7,0,1,2,3,4,k256[11],p[11]); - one_cycle(4,5,6,7,0,1,2,3,k256[12],p[12]); - one_cycle(3,4,5,6,7,0,1,2,k256[13],p[13]); - one_cycle(2,3,4,5,6,7,0,1,k256[14],p[14]); - one_cycle(1,2,3,4,5,6,7,0,k256[15],p[15]); - - one_cycle(0,1,2,3,4,5,6,7,k256[16],hf( 0)); - one_cycle(7,0,1,2,3,4,5,6,k256[17],hf( 1)); - one_cycle(6,7,0,1,2,3,4,5,k256[18],hf( 2)); - one_cycle(5,6,7,0,1,2,3,4,k256[19],hf( 3)); - one_cycle(4,5,6,7,0,1,2,3,k256[20],hf( 4)); - one_cycle(3,4,5,6,7,0,1,2,k256[21],hf( 5)); - one_cycle(2,3,4,5,6,7,0,1,k256[22],hf( 6)); - one_cycle(1,2,3,4,5,6,7,0,k256[23],hf( 7)); - one_cycle(0,1,2,3,4,5,6,7,k256[24],hf( 8)); - one_cycle(7,0,1,2,3,4,5,6,k256[25],hf( 9)); - one_cycle(6,7,0,1,2,3,4,5,k256[26],hf(10)); - one_cycle(5,6,7,0,1,2,3,4,k256[27],hf(11)); - one_cycle(4,5,6,7,0,1,2,3,k256[28],hf(12)); - one_cycle(3,4,5,6,7,0,1,2,k256[29],hf(13)); - one_cycle(2,3,4,5,6,7,0,1,k256[30],hf(14)); 
- one_cycle(1,2,3,4,5,6,7,0,k256[31],hf(15)); - - one_cycle(0,1,2,3,4,5,6,7,k256[32],hf( 0)); - one_cycle(7,0,1,2,3,4,5,6,k256[33],hf( 1)); - one_cycle(6,7,0,1,2,3,4,5,k256[34],hf( 2)); - one_cycle(5,6,7,0,1,2,3,4,k256[35],hf( 3)); - one_cycle(4,5,6,7,0,1,2,3,k256[36],hf( 4)); - one_cycle(3,4,5,6,7,0,1,2,k256[37],hf( 5)); - one_cycle(2,3,4,5,6,7,0,1,k256[38],hf( 6)); - one_cycle(1,2,3,4,5,6,7,0,k256[39],hf( 7)); - one_cycle(0,1,2,3,4,5,6,7,k256[40],hf( 8)); - one_cycle(7,0,1,2,3,4,5,6,k256[41],hf( 9)); - one_cycle(6,7,0,1,2,3,4,5,k256[42],hf(10)); - one_cycle(5,6,7,0,1,2,3,4,k256[43],hf(11)); - one_cycle(4,5,6,7,0,1,2,3,k256[44],hf(12)); - one_cycle(3,4,5,6,7,0,1,2,k256[45],hf(13)); - one_cycle(2,3,4,5,6,7,0,1,k256[46],hf(14)); - one_cycle(1,2,3,4,5,6,7,0,k256[47],hf(15)); - - one_cycle(0,1,2,3,4,5,6,7,k256[48],hf( 0)); - one_cycle(7,0,1,2,3,4,5,6,k256[49],hf( 1)); - one_cycle(6,7,0,1,2,3,4,5,k256[50],hf( 2)); - one_cycle(5,6,7,0,1,2,3,4,k256[51],hf( 3)); - one_cycle(4,5,6,7,0,1,2,3,k256[52],hf( 4)); - one_cycle(3,4,5,6,7,0,1,2,k256[53],hf( 5)); - one_cycle(2,3,4,5,6,7,0,1,k256[54],hf( 6)); - one_cycle(1,2,3,4,5,6,7,0,k256[55],hf( 7)); - one_cycle(0,1,2,3,4,5,6,7,k256[56],hf( 8)); - one_cycle(7,0,1,2,3,4,5,6,k256[57],hf( 9)); - one_cycle(6,7,0,1,2,3,4,5,k256[58],hf(10)); - one_cycle(5,6,7,0,1,2,3,4,k256[59],hf(11)); - one_cycle(4,5,6,7,0,1,2,3,k256[60],hf(12)); - one_cycle(3,4,5,6,7,0,1,2,k256[61],hf(13)); - one_cycle(2,3,4,5,6,7,0,1,k256[62],hf(14)); - one_cycle(1,2,3,4,5,6,7,0,k256[63],hf(15)); - - ctx->hash[0] += v0; ctx->hash[1] += v1; - ctx->hash[2] += v2; ctx->hash[3] += v3; - ctx->hash[4] += v4; ctx->hash[5] += v5; - ctx->hash[6] += v6; ctx->hash[7] += v7; + COMPRESS_ROUND(i, 0, K); + COMPRESS_ROUND(i, 1, K); + COMPRESS_ROUND(i , 2, K); + COMPRESS_ROUND(i, 3, K); + COMPRESS_ROUND(i, 4, K); + COMPRESS_ROUND(i, 5, K); + COMPRESS_ROUND(i, 6, K); + COMPRESS_ROUND(i, 7, K); + COMPRESS_ROUND(i, 8, K); + COMPRESS_ROUND(i, 9, K); + COMPRESS_ROUND(i, 10, K); + COMPRESS_ROUND(i, 11, K); + COMPRESS_ROUND(i, 12, K); + COMPRESS_ROUND(i, 13, K); + COMPRESS_ROUND(i, 14, K); + COMPRESS_ROUND(i, 15, K); #endif + } + ctx->hash[0] += a; + ctx->hash[1] += b; + ctx->hash[2] += c; + ctx->hash[3] += d; + ctx->hash[4] += e; + ctx->hash[5] += f; + ctx->hash[6] += g; + ctx->hash[7] += h; + } } -/* SHA256 hash data in an array of bytes into hash buffer */ -/* and call the hash_compile function as required. 
*/ - -VOID_RETURN sha256_hash(const unsigned char data[], unsigned long len, sha256_ctx ctx[1]) -{ uint_32t pos = (uint_32t)(ctx->count[0] & SHA256_MASK), - space = SHA256_BLOCK_SIZE - pos; - const unsigned char *sp = data; - - if((ctx->count[0] += len) < len) - ++(ctx->count[1]); - - while(len >= space) /* tranfer whole blocks while possible */ - { - memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space); - sp += space; len -= space; space = SHA256_BLOCK_SIZE; pos = 0; - bsw_32(ctx->wbuf, SHA256_BLOCK_SIZE >> 2) - sha256_compile(ctx); - } - - memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len); -} - -/* SHA256 Final padding and digest calculation */ - -static void sha_end1(unsigned char hval[], sha256_ctx ctx[1], const unsigned int hlen) -{ uint_32t i = (uint_32t)(ctx->count[0] & SHA256_MASK); - - /* put bytes in the buffer in an order in which references to */ - /* 32-bit words will put bytes with lower addresses into the */ - /* top of 32 bit words on BOTH big and little endian machines */ - bsw_32(ctx->wbuf, (i + 3) >> 2) - - /* we now need to mask valid bytes and add the padding which is */ - /* a single 1 bit and as many zero bits as necessary. Note that */ - /* we can always add the first padding byte here because the */ - /* buffer always has at least one empty slot */ - ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3); - ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3); - - /* we need 9 or more empty positions, one for the padding byte */ - /* (above) and eight for the length count. If there is not */ - /* enough space pad and empty the buffer */ - if(i > SHA256_BLOCK_SIZE - 9) - { - if(i < 60) ctx->wbuf[15] = 0; - sha256_compile(ctx); - i = 0; - } - else /* compute a word index for the empty buffer positions */ - i = (i >> 2) + 1; - - while(i < 14) /* and zero pad all but last two positions */ - ctx->wbuf[i++] = 0; - - /* the following 32-bit length fields are assembled in the */ - /* wrong byte order on little endian machines but this is */ - /* corrected later since they are only ever used as 32-bit */ - /* word values. 
*/ - ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29); - ctx->wbuf[15] = ctx->count[0] << 3; - sha256_compile(ctx); - - /* extract the hash value as bytes in case the hash buffer is */ - /* mislaigned for 32-bit words */ - for(i = 0; i < hlen; ++i) - hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3))); -} - -#endif - -#if defined(SHA_224) +#ifndef NO_OPTIMIZED_VERSIONS -const uint_32t i224[8] = +#if CRYPTOPP_BOOL_X64 +void Avx2Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks) { - 0xc1059ed8ul, 0x367cd507ul, 0x3070dd17ul, 0xf70e5939ul, - 0xffc00b31ul, 0x68581511ul, 0x64f98fa7ul, 0xbefa4fa4ul -}; + if (num_blks > 1) + sha512_rorx(mp, ctx->hash, num_blks); + else + sha512_sse4(mp, ctx->hash, num_blks); +} -VOID_RETURN sha224_begin(sha224_ctx ctx[1]) +void AvxTransform(sha512_ctx* ctx, void* mp, uint_64t num_blks) { - ctx->count[0] = ctx->count[1] = 0; - memcpy(ctx->hash, i224, 8 * sizeof(uint_32t)); + if (num_blks > 1) + sha512_avx(mp, ctx->hash, num_blks); + else + sha512_sse4(mp, ctx->hash, num_blks); } -VOID_RETURN sha224_end(unsigned char hval[], sha224_ctx ctx[1]) +void SSE4Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks) { - sha_end1(hval, ctx, SHA224_DIGEST_SIZE); + sha512_sse4(mp, ctx->hash, num_blks); } +#endif -VOID_RETURN sha224(unsigned char hval[], const unsigned char data[], unsigned long len) -{ sha224_ctx cx[1]; +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 - sha224_begin(cx); - sha224_hash(data, len, cx); - sha_end1(hval, cx, SHA224_DIGEST_SIZE); +void SSE2Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks) +{ + uint_64t i; + for (i = 0; i < num_blks; i++) + sha512_compress_nayuki(ctx->hash, (uint_8t*)mp + i * 128); } #endif -#if defined(SHA_256) +#endif // NO_OPTIMIZED_VERSIONS -const uint_32t i256[8] = +void sha512_begin(sha512_ctx* ctx) { - 0x6a09e667ul, 0xbb67ae85ul, 0x3c6ef372ul, 0xa54ff53aul, - 0x510e527ful, 0x9b05688cul, 0x1f83d9abul, 0x5be0cd19ul -}; + ctx->hash[0] = 0x6a09e667f3bcc908; + ctx->hash[1] = 0xbb67ae8584caa73b; + ctx->hash[2] = 0x3c6ef372fe94f82b; + ctx->hash[3] = 0xa54ff53a5f1d36f1; + ctx->hash[4] = 0x510e527fade682d1; + ctx->hash[5] = 0x9b05688c2b3e6c1f; + ctx->hash[6] = 0x1f83d9abfb41bd6b; + ctx->hash[7] = 0x5be0cd19137e2179; + ctx->count[0] = 0; + ctx->count[1] = 0; + + if (!transfunc) + { +#ifndef NO_OPTIMIZED_VERSIONS +#if CRYPTOPP_BOOL_X64 + if (g_isIntel&& HasSAVX2() && HasSBMI2()) + transfunc = Avx2Transform; + else if (g_isIntel && HasSAVX()) + { + transfunc = AvxTransform; + } + else if (HasSSE41()) + { + transfunc = SSE4Transform; + } + else +#endif -VOID_RETURN sha256_begin(sha256_ctx ctx[1]) +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 + if (HasSSE2() && HasMMX()) + transfunc = SSE2Transform; + else +#endif + +#endif + transfunc = StdTransform; + } +} + +void sha512_end(unsigned char * result, sha512_ctx* ctx) { - ctx->count[0] = ctx->count[1] = 0; - memcpy(ctx->hash, i256, 8 * sizeof(uint_32t)); + int i; + uint_64t mlen, pos = ctx->count[0]; + uint_8t* m = (uint_8t*) ctx->wbuf; + m[pos++] = 0x80; + if (pos > 112) + { + memset(m + pos, 0, (size_t) (128 - pos)); + transfunc(ctx, m, 1); + pos = 0; + } + memset(m + pos, 0, (size_t) (128 - pos)); + mlen = bswap_64(ctx->count[1]); + memcpy(m + (128 - 8), &mlen, 64 / 8); + transfunc(ctx, m, 1); + for (i = 0; i < 8; i++) + { + ctx->hash[i] = bswap_64(ctx->hash[i]); + } + memcpy(result, ctx->hash, 64); } -VOID_RETURN sha256_end(unsigned char hval[], sha256_ctx ctx[1]) +void sha512_hash(const unsigned char * data, uint_64t 
len, sha512_ctx *ctx) { - sha_end1(hval, ctx, SHA256_DIGEST_SIZE); + uint_64t pos = ctx->count[0]; + uint_64t total = ctx->count[1]; + uint_8t* m = (uint_8t*) ctx->wbuf; + if (pos && pos + len >= 128) + { + memcpy(m + pos, data, (size_t) (128 - pos)); + transfunc(ctx, m, 1); + len -= 128 - pos; + total += (128 - pos) * 8; + data += 128 - pos; + pos = 0; + } + if (len >= 128) + { + uint_64t blocks = len / 128; + uint_64t bytes = blocks * 128; + transfunc(ctx, (void*)data, blocks); + len -= bytes; + total += (bytes)* 8; + data += bytes; + } + memcpy(m+pos, data, (size_t) (len)); + pos += len; + total += len * 8; + ctx->count[0] = pos; + ctx->count[1] = total; } -VOID_RETURN sha256(unsigned char hval[], const unsigned char data[], unsigned long len) -{ sha256_ctx cx[1]; +void sha512(unsigned char * result, const unsigned char* source, uint_64t sourceLen) +{ + sha512_ctx ctx; - sha256_begin(cx); - sha256_hash(data, len, cx); - sha_end1(hval, cx, SHA256_DIGEST_SIZE); + sha512_begin(&ctx); + sha512_hash(source, sourceLen, &ctx); + sha512_end(result, &ctx); } -#endif +///////////////////////////// -#if defined(SHA_384) || defined(SHA_512) +#ifndef NO_OPTIMIZED_VERSIONS -#define SHA512_MASK (SHA512_BLOCK_SIZE - 1) +#if defined(__cplusplus) +extern "C" +{ +#endif -#if defined(SWAP_BYTES) -#define bsw_64(p,n) \ - { int _i = (n); while(_i--) ((uint_64t*)p)[_i] = bswap_64(((uint_64t*)p)[_i]); } -#else -#define bsw_64(p,n) +#if CRYPTOPP_BOOL_X64 + void sha256_sse4(void *input_data, uint_32t digest[8], uint_64t num_blks); + void sha256_rorx(void *input_data, uint_32t digest[8], uint_64t num_blks); + void sha256_avx(void *input_data, uint_32t digest[8], uint_64t num_blks); #endif -/* SHA512 mixing function definitions */ +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 + void sha256_compress_nayuki(uint_32t state[8], const uint_8t block[64]); +#endif -#ifdef s_0 -# undef s_0 -# undef s_1 -# undef g_0 -# undef g_1 -# undef k_0 +#if defined(__cplusplus) +} #endif -#define s_0(x) (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39)) -#define s_1(x) (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41)) -#define g_0(x) (rotr64((x), 1) ^ rotr64((x), 8) ^ ((x) >> 7)) -#define g_1(x) (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6)) -#define k_0 k512 +#endif -/* SHA384/SHA512 mixing data */ +CRYPTOPP_ALIGN_DATA(16) uint_32t SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + +#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE)) + +#ifdef _MSC_VER +# pragma warning(disable: 4100 4731) +#endif -const uint_64t k512[80] = +static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(uint_32t *state, const uint_32t *data, size_t len) { - li_64(428a2f98d728ae22), li_64(7137449123ef65cd), - li_64(b5c0fbcfec4d3b2f), li_64(e9b5dba58189dbbc), - 
li_64(3956c25bf348b538), li_64(59f111f1b605d019), - li_64(923f82a4af194f9b), li_64(ab1c5ed5da6d8118), - li_64(d807aa98a3030242), li_64(12835b0145706fbe), - li_64(243185be4ee4b28c), li_64(550c7dc3d5ffb4e2), - li_64(72be5d74f27b896f), li_64(80deb1fe3b1696b1), - li_64(9bdc06a725c71235), li_64(c19bf174cf692694), - li_64(e49b69c19ef14ad2), li_64(efbe4786384f25e3), - li_64(0fc19dc68b8cd5b5), li_64(240ca1cc77ac9c65), - li_64(2de92c6f592b0275), li_64(4a7484aa6ea6e483), - li_64(5cb0a9dcbd41fbd4), li_64(76f988da831153b5), - li_64(983e5152ee66dfab), li_64(a831c66d2db43210), - li_64(b00327c898fb213f), li_64(bf597fc7beef0ee4), - li_64(c6e00bf33da88fc2), li_64(d5a79147930aa725), - li_64(06ca6351e003826f), li_64(142929670a0e6e70), - li_64(27b70a8546d22ffc), li_64(2e1b21385c26c926), - li_64(4d2c6dfc5ac42aed), li_64(53380d139d95b3df), - li_64(650a73548baf63de), li_64(766a0abb3c77b2a8), - li_64(81c2c92e47edaee6), li_64(92722c851482353b), - li_64(a2bfe8a14cf10364), li_64(a81a664bbc423001), - li_64(c24b8b70d0f89791), li_64(c76c51a30654be30), - li_64(d192e819d6ef5218), li_64(d69906245565a910), - li_64(f40e35855771202a), li_64(106aa07032bbd1b8), - li_64(19a4c116b8d2d0c8), li_64(1e376c085141ab53), - li_64(2748774cdf8eeb99), li_64(34b0bcb5e19b48a8), - li_64(391c0cb3c5c95a63), li_64(4ed8aa4ae3418acb), - li_64(5b9cca4f7763e373), li_64(682e6ff3d6b2b8a3), - li_64(748f82ee5defb2fc), li_64(78a5636f43172f60), - li_64(84c87814a1f0ab72), li_64(8cc702081a6439ec), - li_64(90befffa23631e28), li_64(a4506cebde82bde9), - li_64(bef9a3f7b2c67915), li_64(c67178f2e372532b), - li_64(ca273eceea26619c), li_64(d186b8c721c0c207), - li_64(eada7dd6cde0eb1e), li_64(f57d4f7fee6ed178), - li_64(06f067aa72176fba), li_64(0a637dc5a2c898a6), - li_64(113f9804bef90dae), li_64(1b710b35131c471b), - li_64(28db77f523047d84), li_64(32caab7b40c72493), - li_64(3c9ebe0a15c9bebc), li_64(431d67c49c100d4c), - li_64(4cc5d4becb3e42b6), li_64(597f299cfc657e2a), - li_64(5fcb6fab3ad6faec), li_64(6c44198c4a475817) -}; + #define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ + #define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4] + #define G(i) H(i+1) + #define F(i) H(i+2) + #define E(i) H(i+3) + #define D(i) H(i+4) + #define C(i) H(i+5) + #define B(i) H(i+6) + #define A(i) H(i+7) + #define Wt(i) BASE+8*4+ASM_MOD(1024+15-(i),16)*4 + #define Wt_2(i) Wt((i)-2) + #define Wt_15(i) Wt((i)-15) + #define Wt_7(i) Wt((i)-7) + #define K_END [BASE+8*4+16*4+0*WORD_SZ] + #define STATE_SAVE [BASE+8*4+16*4+1*WORD_SZ] + #define DATA_SAVE [BASE+8*4+16*4+2*WORD_SZ] + #define DATA_END [BASE+8*4+16*4+3*WORD_SZ] + #define Kt(i) WORD_REG(si)+(i)*4 +#if CRYPTOPP_BOOL_X32 + #define BASE esp+8 +#elif CRYPTOPP_BOOL_X86 + #define BASE esp+4 +#elif defined(__GNUC__) + #define BASE r8 +#else + #define BASE rsp +#endif -/* Compile 128 bytes of hash data into SHA384/512 digest */ -/* NOTE: this routine assumes that the byte order in the */ -/* ctx->wbuf[] at this point is such that low address bytes */ -/* in the ORIGINAL byte stream will go into the high end of */ -/* words on BOTH big and little endian systems */ +#define RA0(i, edx, edi) \ + AS2( add edx, [Kt(i)] )\ + AS2( add edx, [Wt(i)] )\ + AS2( add edx, H(i) )\ + +#define RA1(i, edx, edi) + +#define RB0(i, edx, edi) + +#define RB1(i, edx, edi) \ + AS2( mov AS_REG_7d, [Wt_2(i)] )\ + AS2( mov edi, [Wt_15(i)])\ + AS2( mov ebx, AS_REG_7d )\ + AS2( shr AS_REG_7d, 10 )\ + AS2( ror ebx, 17 )\ + AS2( xor AS_REG_7d, ebx )\ + AS2( ror ebx, 2 )\ + AS2( xor ebx, AS_REG_7d )/* s1(W_t-2) */\ + AS2( add ebx, [Wt_7(i)])\ + AS2( mov AS_REG_7d, edi )\ + AS2( shr AS_REG_7d, 3 )\ 
+ AS2( ror edi, 7 )\ + AS2( add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\ + AS2( xor AS_REG_7d, edi )\ + AS2( add edx, [Kt(i)])\ + AS2( ror edi, 11 )\ + AS2( add edx, H(i) )\ + AS2( xor AS_REG_7d, edi )/* s0(W_t-15) */\ + AS2( add AS_REG_7d, ebx )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) W_t-16*/\ + AS2( mov [Wt(i)], AS_REG_7d)\ + AS2( add edx, AS_REG_7d )\ + +#define ROUND(i, r, eax, ecx, edi, edx)\ + /* in: edi = E */\ + /* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\ + AS2( mov edx, F(i) )\ + AS2( xor edx, G(i) )\ + AS2( and edx, edi )\ + AS2( xor edx, G(i) )/* Ch(E,F,G) = (G^(E&(F^G))) */\ + AS2( mov AS_REG_7d, edi )\ + AS2( ror edi, 6 )\ + AS2( ror AS_REG_7d, 25 )\ + RA##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\ + AS2( xor AS_REG_7d, edi )\ + AS2( ror edi, 5 )\ + AS2( xor AS_REG_7d, edi )/* S1(E) */\ + AS2( add edx, AS_REG_7d )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\ + RB##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\ + /* in: ecx = A, eax = B^C, edx = T1 */\ + /* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\ + AS2( mov ebx, ecx )\ + AS2( xor ecx, B(i) )/* A^B */\ + AS2( and eax, ecx )\ + AS2( xor eax, B(i) )/* Maj(A,B,C) = B^((A^B)&(B^C) */\ + AS2( mov AS_REG_7d, ebx )\ + AS2( ror ebx, 2 )\ + AS2( add eax, edx )/* T1 + Maj(A,B,C) */\ + AS2( add edx, D(i) )\ + AS2( mov D(i), edx )\ + AS2( ror AS_REG_7d, 22 )\ + AS2( xor AS_REG_7d, ebx )\ + AS2( ror ebx, 11 )\ + AS2( xor AS_REG_7d, ebx )\ + AS2( add eax, AS_REG_7d )/* T1 + S0(A) + Maj(A,B,C) */\ + AS2( mov H(i), eax )\ + +// Unroll the use of CRYPTOPP_BOOL_X64 in assembler math. The GAS assembler on X32 (version 2.25) +// complains "Error: invalid operands (*ABS* and *UND* sections) for `*` and `-`" +#if CRYPTOPP_BOOL_X64 +#define SWAP_COPY(i) \ + AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\ + AS1( bswap WORD_REG(bx))\ + AS2( mov [Wt(i*2+1)], WORD_REG(bx)) +#else // X86 and X32 +#define SWAP_COPY(i) \ + AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\ + AS1( bswap WORD_REG(bx))\ + AS2( mov [Wt(i)], WORD_REG(bx)) +#endif -VOID_RETURN sha512_compile(sha512_ctx ctx[1]) -{ uint_64t v[8], *p = ctx->wbuf; - uint_32t j; -#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG) - uint_32t i; +#if defined(__GNUC__) + #if CRYPTOPP_BOOL_X64 + CRYPTOPP_ALIGN_DATA(16) byte workspace[LOCALS_SIZE] ; + #endif + __asm__ __volatile__ + ( + #if CRYPTOPP_BOOL_X64 + "lea %4, %%r8;" + #endif + INTEL_NOPREFIX #endif - memcpy(v, ctx->hash, 8 * sizeof(uint_64t)); +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 + #ifndef __GNUC__ + AS2( mov edi, [len]) + AS2( lea WORD_REG(si), [SHA256_K+48*4]) + #endif + #if !defined(_MSC_VER) || (_MSC_VER < 1400) + AS_PUSH_IF86(bx) + #endif + + AS_PUSH_IF86(bp) + AS2( mov ebx, esp) + AS2( and esp, -16) + AS2( sub WORD_REG(sp), LOCALS_SIZE) + AS_PUSH_IF86(bx) +#endif + AS2( mov STATE_SAVE, WORD_REG(cx)) + AS2( mov DATA_SAVE, WORD_REG(dx)) + AS2( lea WORD_REG(ax), [WORD_REG(di) + WORD_REG(dx)]) + AS2( mov DATA_END, WORD_REG(ax)) + AS2( mov K_END, WORD_REG(si)) + +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 + AS2( test edi, 1) + ASJ( jnz, 2, f) + AS1( dec DWORD PTR K_END) +#endif + AS2( movdqa xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16]) + AS2( movdqa xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16]) +#endif - for(j = 0; j < 80; j += 16) - { -#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG) - for (i = 0; i < 16; i++) - { - v_cycle( i, j); - } -#else - v_cycle( 0, j); v_cycle( 1, j); - v_cycle( 2, j); v_cycle( 3, j); - v_cycle( 4, j); v_cycle( 5, j); - v_cycle( 6, 
j); v_cycle( 7, j); - v_cycle( 8, j); v_cycle( 9, j); - v_cycle(10, j); v_cycle(11, j); - v_cycle(12, j); v_cycle(13, j); - v_cycle(14, j); v_cycle(15, j); -#endif - } - - ctx->hash[0] += v[0]; ctx->hash[1] += v[1]; - ctx->hash[2] += v[2]; ctx->hash[3] += v[3]; - ctx->hash[4] += v[4]; ctx->hash[5] += v[5]; - ctx->hash[6] += v[6]; ctx->hash[7] += v[7]; -} +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + ASJ( jmp, 0, f) +#endif + ASL(2) // non-SSE2 + AS2( mov esi, ecx) + AS2( lea edi, A(0)) + AS2( mov ecx, 8) +ATT_NOPREFIX + AS1( rep movsd) +INTEL_NOPREFIX + AS2( mov esi, K_END) + ASJ( jmp, 3, f) +#endif -/* Compile 128 bytes of hash data into SHA256 digest value */ -/* NOTE: this routine assumes that the byte order in the */ -/* ctx->wbuf[] at this point is in such an order that low */ -/* address bytes in the ORIGINAL byte stream placed in this */ -/* buffer will now go to the high end of words on BOTH big */ -/* and little endian systems */ - -VOID_RETURN sha512_hash(const unsigned char data[], unsigned long len, sha512_ctx ctx[1]) -{ uint_32t pos = (uint_32t)(ctx->count[0] & SHA512_MASK), - space = SHA512_BLOCK_SIZE - pos; - const unsigned char *sp = data; - - if((ctx->count[0] += len) < len) - ++(ctx->count[1]); - - while(len >= space) /* tranfer whole blocks while possible */ - { - memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space); - sp += space; len -= space; space = SHA512_BLOCK_SIZE; pos = 0; - bsw_64(ctx->wbuf, SHA512_BLOCK_SIZE >> 3); - sha512_compile(ctx); - } - - memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len); -} +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + ASL(0) + AS2( movdqa E(0), xmm1) + AS2( movdqa A(0), xmm0) +#endif +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 + ASL(3) +#endif + AS2( sub WORD_REG(si), 48*4) + SWAP_COPY(0) SWAP_COPY(1) SWAP_COPY(2) SWAP_COPY(3) + SWAP_COPY(4) SWAP_COPY(5) SWAP_COPY(6) SWAP_COPY(7) +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 + SWAP_COPY(8) SWAP_COPY(9) SWAP_COPY(10) SWAP_COPY(11) + SWAP_COPY(12) SWAP_COPY(13) SWAP_COPY(14) SWAP_COPY(15) +#endif + AS2( mov edi, E(0)) // E + AS2( mov eax, B(0)) // B + AS2( xor eax, C(0)) // B^C + AS2( mov ecx, A(0)) // A + + ROUND(0, 0, eax, ecx, edi, edx) + ROUND(1, 0, ecx, eax, edx, edi) + ROUND(2, 0, eax, ecx, edi, edx) + ROUND(3, 0, ecx, eax, edx, edi) + ROUND(4, 0, eax, ecx, edi, edx) + ROUND(5, 0, ecx, eax, edx, edi) + ROUND(6, 0, eax, ecx, edi, edx) + ROUND(7, 0, ecx, eax, edx, edi) + ROUND(8, 0, eax, ecx, edi, edx) + ROUND(9, 0, ecx, eax, edx, edi) + ROUND(10, 0, eax, ecx, edi, edx) + ROUND(11, 0, ecx, eax, edx, edi) + ROUND(12, 0, eax, ecx, edi, edx) + ROUND(13, 0, ecx, eax, edx, edi) + ROUND(14, 0, eax, ecx, edi, edx) + ROUND(15, 0, ecx, eax, edx, edi) + + ASL(1) + AS2(add WORD_REG(si), 4*16) + ROUND(0, 1, eax, ecx, edi, edx) + ROUND(1, 1, ecx, eax, edx, edi) + ROUND(2, 1, eax, ecx, edi, edx) + ROUND(3, 1, ecx, eax, edx, edi) + ROUND(4, 1, eax, ecx, edi, edx) + ROUND(5, 1, ecx, eax, edx, edi) + ROUND(6, 1, eax, ecx, edi, edx) + ROUND(7, 1, ecx, eax, edx, edi) + ROUND(8, 1, eax, ecx, edi, edx) + ROUND(9, 1, ecx, eax, edx, edi) + ROUND(10, 1, eax, ecx, edi, edx) + ROUND(11, 1, ecx, eax, edx, edi) + ROUND(12, 1, eax, ecx, edi, edx) + ROUND(13, 1, ecx, eax, edx, edi) + ROUND(14, 1, eax, ecx, edi, edx) + ROUND(15, 1, ecx, eax, edx, edi) + AS2( cmp WORD_REG(si), K_END) + ATT_NOPREFIX + ASJ( jb, 1, b) + INTEL_NOPREFIX + + AS2( mov WORD_REG(dx), DATA_SAVE) + AS2( add WORD_REG(dx), 64) + AS2( mov AS_REG_7, STATE_SAVE) + AS2( mov DATA_SAVE, WORD_REG(dx)) + +#if 
CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 + AS2( test DWORD PTR K_END, 1) + ASJ( jz, 4, f) +#endif + AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_7+1*16]) + AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_7+0*16]) + AS2( paddd xmm1, E(0)) + AS2( paddd xmm0, A(0)) + AS2( movdqa [AS_REG_7+1*16], xmm1) + AS2( movdqa [AS_REG_7+0*16], xmm0) + AS2( cmp WORD_REG(dx), DATA_END) + ATT_NOPREFIX + ASJ( jb, 0, b) + INTEL_NOPREFIX +#endif -/* SHA384/512 Final padding and digest calculation */ - -static void sha_end2(unsigned char hval[], sha512_ctx ctx[1], const unsigned int hlen) -{ uint_32t i = (uint_32t)(ctx->count[0] & SHA512_MASK); - - /* put bytes in the buffer in an order in which references to */ - /* 32-bit words will put bytes with lower addresses into the */ - /* top of 32 bit words on BOTH big and little endian machines */ - bsw_64(ctx->wbuf, (i + 7) >> 3); - - /* we now need to mask valid bytes and add the padding which is */ - /* a single 1 bit and as many zero bits as necessary. Note that */ - /* we can always add the first padding byte here because the */ - /* buffer always has at least one empty slot */ - ctx->wbuf[i >> 3] &= li_64(ffffffffffffff00) << 8 * (~i & 7); - ctx->wbuf[i >> 3] |= li_64(0000000000000080) << 8 * (~i & 7); - - /* we need 17 or more empty byte positions, one for the padding */ - /* byte (above) and sixteen for the length count. If there is */ - /* not enough space pad and empty the buffer */ - if(i > SHA512_BLOCK_SIZE - 17) - { - if(i < 120) ctx->wbuf[15] = 0; - sha512_compile(ctx); - i = 0; - } - else - i = (i >> 3) + 1; - - while(i < 14) - ctx->wbuf[i++] = 0; - - /* the following 64-bit length fields are assembled in the */ - /* wrong byte order on little endian machines but this is */ - /* corrected later since they are only ever used as 64-bit */ - /* word values. 
*/ - ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 61); - ctx->wbuf[15] = ctx->count[0] << 3; - sha512_compile(ctx); - - /* extract the hash value as bytes in case the hash buffer is */ - /* misaligned for 32-bit words */ - for(i = 0; i < hlen; ++i) - hval[i] = (unsigned char)(ctx->hash[i >> 3] >> (8 * (~i & 7))); -} +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + ASJ( jmp, 5, f) + ASL(4) // non-SSE2 +#endif + AS2( add [AS_REG_7+0*4], ecx) // A + AS2( add [AS_REG_7+4*4], edi) // E + AS2( mov eax, B(0)) + AS2( mov ebx, C(0)) + AS2( mov ecx, D(0)) + AS2( add [AS_REG_7+1*4], eax) + AS2( add [AS_REG_7+2*4], ebx) + AS2( add [AS_REG_7+3*4], ecx) + AS2( mov eax, F(0)) + AS2( mov ebx, G(0)) + AS2( mov ecx, H(0)) + AS2( add [AS_REG_7+5*4], eax) + AS2( add [AS_REG_7+6*4], ebx) + AS2( add [AS_REG_7+7*4], ecx) + AS2( mov ecx, AS_REG_7d) + AS2( cmp WORD_REG(dx), DATA_END) + ASJ( jb, 2, b) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE + ASL(5) +#endif +#endif + AS_POP_IF86(sp) + AS_POP_IF86(bp) + #if !defined(_MSC_VER) || (_MSC_VER < 1400) + AS_POP_IF86(bx) + #endif + +#ifdef __GNUC__ + ATT_PREFIX + : + : "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len) + #if CRYPTOPP_BOOL_X64 + , "m" (workspace[0]) + #endif + : "memory", "cc", "%eax" + #if CRYPTOPP_BOOL_X64 + , "%rbx", "%r8", "%r10" + #endif + ); #endif +} -#if defined(SHA_384) +#endif // (defined(CRYPTOPP_X86_ASM_AVAILABLE)) -/* SHA384 initialisation data */ +#undef sum0 +#undef sum1 +#undef sigma0 +#undef sigma1 -const uint_64t i384[80] = -{ - li_64(cbbb9d5dc1059ed8), li_64(629a292a367cd507), - li_64(9159015a3070dd17), li_64(152fecd8f70e5939), - li_64(67332667ffc00b31), li_64(8eb44a8768581511), - li_64(db0c2e0d64f98fa7), li_64(47b5481dbefa4fa4) -}; +#define sum0(x) (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22)) +#define sum1(x) (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25)) +#define sigma0(x) (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3)) +#define sigma1(x) (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10)) -VOID_RETURN sha384_begin(sha384_ctx ctx[1]) -{ - ctx->count[0] = ctx->count[1] = 0; - memcpy(ctx->hash, i384, 8 * sizeof(uint_64t)); -} -VOID_RETURN sha384_end(unsigned char hval[], sha384_ctx ctx[1]) -{ - sha_end2(hval, ctx, SHA384_DIGEST_SIZE); -} +typedef void (*sha256transformFn)(sha256_ctx* ctx, void* m, uint_64t num_blks); -VOID_RETURN sha384(unsigned char hval[], const unsigned char data[], unsigned long len) -{ sha384_ctx cx[1]; - - sha384_begin(cx); - sha384_hash(data, len, cx); - sha_end2(hval, cx, SHA384_DIGEST_SIZE); -} +sha256transformFn sha256transfunc = NULL; +void StdSha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks) +{ + uint_64t blk; + for (blk = 0; blk < num_blks; blk++) + { + uint_32t W[16]; + uint_32t a,b,c,d,e,f,g,h; + uint_32t T1, T2; + int i; +#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG) + int j; #endif -#if defined(SHA_512) - -/* SHA512 initialisation data */ + for (i = 0; i < 64 / 4; i++) + { + W[i] = bswap_32((((const uint_32t*)(mp))[blk * 16 + i])); + } + + a = ctx->hash[0]; + b = ctx->hash[1]; + c = ctx->hash[2]; + d = ctx->hash[3]; + e = ctx->hash[4]; + f = ctx->hash[5]; + g = ctx->hash[6]; + h = ctx->hash[7]; + + for (i = 0; i <= 63; i+=16) + { +#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG) + for (j = 0; j < 16; j++) + { + COMPRESS_ROUND(i, j, SHA256_K); + } +#else + COMPRESS_ROUND(i, 0, SHA256_K); + COMPRESS_ROUND(i, 1, SHA256_K); + COMPRESS_ROUND(i , 2, SHA256_K); + COMPRESS_ROUND(i, 3, SHA256_K); + COMPRESS_ROUND(i, 4, SHA256_K); + 
COMPRESS_ROUND(i, 5, SHA256_K); + COMPRESS_ROUND(i, 6, SHA256_K); + COMPRESS_ROUND(i, 7, SHA256_K); + COMPRESS_ROUND(i, 8, SHA256_K); + COMPRESS_ROUND(i, 9, SHA256_K); + COMPRESS_ROUND(i, 10, SHA256_K); + COMPRESS_ROUND(i, 11, SHA256_K); + COMPRESS_ROUND(i, 12, SHA256_K); + COMPRESS_ROUND(i, 13, SHA256_K); + COMPRESS_ROUND(i, 14, SHA256_K); + COMPRESS_ROUND(i, 15, SHA256_K); +#endif + } + ctx->hash[0] += a; + ctx->hash[1] += b; + ctx->hash[2] += c; + ctx->hash[3] += d; + ctx->hash[4] += e; + ctx->hash[5] += f; + ctx->hash[6] += g; + ctx->hash[7] += h; + } +} -const uint_64t i512[80] = -{ - li_64(6a09e667f3bcc908), li_64(bb67ae8584caa73b), - li_64(3c6ef372fe94f82b), li_64(a54ff53a5f1d36f1), - li_64(510e527fade682d1), li_64(9b05688c2b3e6c1f), - li_64(1f83d9abfb41bd6b), li_64(5be0cd19137e2179) -}; +#ifndef NO_OPTIMIZED_VERSIONS -VOID_RETURN sha512_begin(sha512_ctx ctx[1]) +#if CRYPTOPP_BOOL_X64 +void Avx2Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks) { - ctx->count[0] = ctx->count[1] = 0; - memcpy(ctx->hash, i512, 8 * sizeof(uint_64t)); + if (num_blks > 1) + sha256_rorx(mp, ctx->hash, num_blks); + else + sha256_sse4(mp, ctx->hash, num_blks); } -VOID_RETURN sha512_end(unsigned char hval[], sha512_ctx ctx[1]) +void AvxSha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks) { - sha_end2(hval, ctx, SHA512_DIGEST_SIZE); + if (num_blks > 1) + sha256_avx(mp, ctx->hash, num_blks); + else + sha256_sse4(mp, ctx->hash, num_blks); } -VOID_RETURN sha512(unsigned char hval[], const unsigned char data[], unsigned long len) -{ sha512_ctx cx[1]; - - sha512_begin(cx); - sha512_hash(data, len, cx); - sha_end2(hval, cx, SHA512_DIGEST_SIZE); +void SSE4Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks) +{ + sha256_sse4(mp, ctx->hash, num_blks); } #endif -#if defined(SHA_2) - -#define CTX_224(x) ((x)->uu->ctx256) -#define CTX_256(x) ((x)->uu->ctx256) -#define CTX_384(x) ((x)->uu->ctx512) -#define CTX_512(x) ((x)->uu->ctx512) - -/* SHA2 initialisation */ - -INT_RETURN sha2_begin(unsigned long len, sha2_ctx ctx[1]) +#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE)) +void SSE2Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks) { - switch(len) - { -#if defined(SHA_224) - case 224: - case 28: CTX_256(ctx)->count[0] = CTX_256(ctx)->count[1] = 0; - memcpy(CTX_256(ctx)->hash, i224, 32); - ctx->sha2_len = 28; return EXIT_SUCCESS; -#endif -#if defined(SHA_256) - case 256: - case 32: CTX_256(ctx)->count[0] = CTX_256(ctx)->count[1] = 0; - memcpy(CTX_256(ctx)->hash, i256, 32); - ctx->sha2_len = 32; return EXIT_SUCCESS; -#endif -#if defined(SHA_384) - case 384: - case 48: CTX_384(ctx)->count[0] = CTX_384(ctx)->count[1] = 0; - memcpy(CTX_384(ctx)->hash, i384, 64); - ctx->sha2_len = 48; return EXIT_SUCCESS; -#endif -#if defined(SHA_512) - case 512: - case 64: CTX_512(ctx)->count[0] = CTX_512(ctx)->count[1] = 0; - memcpy(CTX_512(ctx)->hash, i512, 64); - ctx->sha2_len = 64; return EXIT_SUCCESS; -#endif - default: return EXIT_FAILURE; - } + X86_SHA256_HashBlocks(ctx->hash, (const uint_32t*)mp, (size_t)(num_blks * 64)); } +#endif -VOID_RETURN sha2_hash(const unsigned char data[], unsigned long len, sha2_ctx ctx[1]) +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 +void Sha256AsmTransform(sha256_ctx* ctx, void* mp, uint_64t num_blks) { - switch(ctx->sha2_len) - { -#if defined(SHA_224) - case 28: sha224_hash(data, len, CTX_224(ctx)); return; -#endif -#if defined(SHA_256) - case 32: sha256_hash(data, len, CTX_256(ctx)); return; -#endif -#if defined(SHA_384) - case 48: 
sha384_hash(data, len, CTX_384(ctx)); return; + uint_64t i; + for (i = 0; i < num_blks; i++) + sha256_compress_nayuki(ctx->hash, (uint_8t*)mp + i * 64); +} #endif -#if defined(SHA_512) - case 64: sha512_hash(data, len, CTX_512(ctx)); return; + #endif - } -} -VOID_RETURN sha2_end(unsigned char hval[], sha2_ctx ctx[1]) +void sha256_begin(sha256_ctx* ctx) { - switch(ctx->sha2_len) - { -#if defined(SHA_224) - case 28: sha_end1(hval, CTX_224(ctx), SHA224_DIGEST_SIZE); return; + ctx->hash[0] = 0x6a09e667; + ctx->hash[1] = 0xbb67ae85; + ctx->hash[2] = 0x3c6ef372; + ctx->hash[3] = 0xa54ff53a; + ctx->hash[4] = 0x510e527f; + ctx->hash[5] = 0x9b05688c; + ctx->hash[6] = 0x1f83d9ab; + ctx->hash[7] = 0x5be0cd19; + ctx->count[0] = 0; + ctx->count[1] = 0; + + if (!sha256transfunc) + { +#ifndef NO_OPTIMIZED_VERSIONS +#ifdef _M_X64 + if (g_isIntel && HasSAVX2() && HasSBMI2()) + sha256transfunc = Avx2Sha256Transform; + else if (g_isIntel && HasSAVX()) + sha256transfunc = AvxSha256Transform; + else if (HasSSE41()) + sha256transfunc = SSE4Sha256Transform; + else #endif -#if defined(SHA_256) - case 32: sha_end1(hval, CTX_256(ctx), SHA256_DIGEST_SIZE); return; + +#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE)) + if (HasSSE2 ()) + sha256transfunc = SSE2Sha256Transform; + else #endif -#if defined(SHA_384) - case 48: sha_end2(hval, CTX_384(ctx), SHA384_DIGEST_SIZE); return; + +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 + sha256transfunc = Sha256AsmTransform; +#else + sha256transfunc = StdSha256Transform; #endif -#if defined(SHA_512) - case 64: sha_end2(hval, CTX_512(ctx), SHA512_DIGEST_SIZE); return; +#else + sha256transfunc = StdSha256Transform; #endif - } + } } -INT_RETURN sha2(unsigned char hval[], unsigned long size, - const unsigned char data[], unsigned long len) -{ sha2_ctx cx[1]; +void sha256_end(unsigned char * result, sha256_ctx* ctx) +{ + int i; + uint_64t mlen, pos = ctx->count[0]; + uint_8t* m = (uint_8t*) ctx->wbuf; + m[pos++] = 0x80; + if (pos > 56) + { + memset(m + pos, 0, (size_t) (64 - pos)); + sha256transfunc(ctx, m, 1); + pos = 0; + } + memset(m + pos, 0, (size_t) (56 - pos)); + mlen = bswap_64((uint_64t) ctx->count[1]); + memcpy(m + (64 - 8), &mlen, 64 / 8); + sha256transfunc(ctx, m, 1); + for (i = 0; i < 8; i++) + { + ctx->hash[i] = bswap_32(ctx->hash[i]); + } + memcpy(result, ctx->hash, 32); +} - if(sha2_begin(size, cx) == EXIT_SUCCESS) - { - sha2_hash(data, len, cx); sha2_end(hval, cx); return EXIT_SUCCESS; - } - else - return EXIT_FAILURE; +void sha256_hash(const unsigned char * data, uint_32t len, sha256_ctx *ctx) +{ + uint_32t pos = ctx->count[0]; + uint_32t total = ctx->count[1]; + uint_8t* m = (uint_8t*) ctx->wbuf; + if (pos && pos + len >= 64) + { + memcpy(m + pos, data, 64 - pos); + sha256transfunc(ctx, m, 1); + len -= 64 - pos; + total += (64 - pos) * 8; + data += 64 - pos; + pos = 0; + } + if (len >= 64) + { + uint_32t blocks = len / 64; + uint_32t bytes = blocks * 64; + sha256transfunc(ctx, (void*)data, blocks); + len -= bytes; + total += (bytes)* 8; + data += bytes; + } + memcpy(m+pos, data, len); + pos += len; + total += len * 8; + ctx->count[0] = pos; + ctx->count[1] = total; } -#endif +void sha256(unsigned char * result, const unsigned char* source, uint_32t sourceLen) +{ + sha256_ctx ctx; -#if defined(__cplusplus) + sha256_begin(&ctx); + sha256_hash(source, sourceLen, &ctx); + sha256_end(result, &ctx); } -#endif diff --git a/src/Crypto/Sha2.h b/src/Crypto/Sha2.h index 6d0aeb0f..37625ce8 100644 --- a/src/Crypto/Sha2.h +++ b/src/Crypto/Sha2.h @@ 
-1,155 +1,60 @@ /* - --------------------------------------------------------------------------- - Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software is allowed (with or without - changes) provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 01/08/2005 -*/ + * Copyright (c) 2013-2017 IDRIX + * Governed by the Apache License 2.0 the full text of which is contained + * in the file License.txt included in VeraCrypt binary and source + * code distribution packages. + */ #ifndef _SHA2_H #define _SHA2_H #include "Common/Tcdefs.h" #include "Common/Endian.h" - -#define SHA_64BIT - -/* define the hash functions that you need */ -#define SHA_2 /* for dynamic hash length */ -#define SHA_224 -#define SHA_256 -#ifdef SHA_64BIT -# define SHA_384 -# define SHA_512 -# define NEED_UINT_64T -#endif - -#ifndef EXIT_SUCCESS -#define EXIT_SUCCESS 0 -#define EXIT_FAILURE 1 -#endif - -#define li_64(h) 0x##h##ull - -#define VOID_RETURN void -#define INT_RETURN int +#include "Crypto/config.h" #if defined(__cplusplus) -extern "C" -{ +extern "C" { #endif -/* Note that the following function prototypes are the same */ -/* for both the bit and byte oriented implementations. But */ -/* the length fields are in bytes or bits as is appropriate */ -/* for the version used. 
Bit sequences are arrays of bytes */ -/* in which bit sequence indexes increase from the most to */ -/* the least significant end of each byte */ - -#define SHA224_DIGEST_SIZE 28 -#define SHA224_BLOCK_SIZE 64 #define SHA256_DIGEST_SIZE 32 #define SHA256_BLOCK_SIZE 64 -/* type to hold the SHA256 (and SHA224) context */ - -typedef struct -{ uint_32t count[2]; - uint_32t hash[8]; - uint_32t wbuf[16]; -} sha256_ctx; - -typedef sha256_ctx sha224_ctx; - -VOID_RETURN sha256_compile(sha256_ctx ctx[1]); - -VOID_RETURN sha224_begin(sha224_ctx ctx[1]); -#define sha224_hash sha256_hash -VOID_RETURN sha224_end(unsigned char hval[], sha224_ctx ctx[1]); -VOID_RETURN sha224(unsigned char hval[], const unsigned char data[], unsigned long len); - -VOID_RETURN sha256_begin(sha256_ctx ctx[1]); -VOID_RETURN sha256_hash(const unsigned char data[], unsigned long len, sha256_ctx ctx[1]); -VOID_RETURN sha256_end(unsigned char hval[], sha256_ctx ctx[1]); -VOID_RETURN sha256(unsigned char hval[], const unsigned char data[], unsigned long len); - -#ifndef SHA_64BIT - -typedef struct -{ union - { sha256_ctx ctx256[1]; - } uu[1]; - uint_32t sha2_len; -} sha2_ctx; - -#define SHA2_MAX_DIGEST_SIZE SHA256_DIGEST_SIZE - -#else - -#define SHA384_DIGEST_SIZE 48 -#define SHA384_BLOCK_SIZE 128 #define SHA512_DIGEST_SIZE 64 #define SHA512_BLOCK_SIZE 128 -#define SHA2_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE -/* type to hold the SHA384 (and SHA512) context */ +#if CRYPTOPP_BOOL_X64 +#define SHA2_ALIGN CRYPTOPP_ALIGN_DATA(32) +#else +#define SHA2_ALIGN CRYPTOPP_ALIGN_DATA(16) +#endif typedef struct { uint_64t count[2]; - uint_64t hash[8]; - uint_64t wbuf[16]; + SHA2_ALIGN uint_64t hash[8]; + SHA2_ALIGN uint_64t wbuf[16]; } sha512_ctx; -typedef sha512_ctx sha384_ctx; - typedef struct -{ union - { sha256_ctx ctx256[1]; - sha512_ctx ctx512[1]; - } uu[1]; - uint_32t sha2_len; -} sha2_ctx; - -VOID_RETURN sha512_compile(sha512_ctx ctx[1]); - -VOID_RETURN sha384_begin(sha384_ctx ctx[1]); -#define sha384_hash sha512_hash -VOID_RETURN sha384_end(unsigned char hval[], sha384_ctx ctx[1]); -VOID_RETURN sha384(unsigned char hval[], const unsigned char data[], unsigned long len); +{ uint_32t count[2]; + SHA2_ALIGN uint_32t hash[8]; + SHA2_ALIGN uint_32t wbuf[16]; +} sha256_ctx; -VOID_RETURN sha512_begin(sha512_ctx ctx[1]); -VOID_RETURN sha512_hash(const unsigned char data[], unsigned long len, sha512_ctx ctx[1]); -VOID_RETURN sha512_end(unsigned char hval[], sha512_ctx ctx[1]); -VOID_RETURN sha512(unsigned char hval[], const unsigned char data[], unsigned long len); -INT_RETURN sha2_begin(unsigned long size, sha2_ctx ctx[1]); -VOID_RETURN sha2_hash(const unsigned char data[], unsigned long len, sha2_ctx ctx[1]); -VOID_RETURN sha2_end(unsigned char hval[], sha2_ctx ctx[1]); -INT_RETURN sha2(unsigned char hval[], unsigned long size, const unsigned char data[], unsigned long len); +void sha512_begin(sha512_ctx* ctx); +void sha512_hash(const unsigned char * source, uint_64t sourceLen, sha512_ctx *ctx); +void sha512_end(unsigned char * result, sha512_ctx* ctx); +void sha512(unsigned char * result, const unsigned char* source, uint_64t sourceLen); -#endif +void sha256_begin(sha256_ctx* ctx); +void sha256_hash(const unsigned char * source, uint_32t sourceLen, sha256_ctx *ctx); +void sha256_end(unsigned char * result, sha256_ctx* ctx); +void sha256(unsigned char * result, const unsigned char* source, uint_32t sourceLen); #if defined(__cplusplus) } #endif + + #endif diff --git a/src/Crypto/Sources b/src/Crypto/Sources index 60412bf1..a93f9530 100644 --- 
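In the trimmed-down Sha2.h, the hash state and message-schedule buffers are declared with SHA2_ALIGN (32-byte alignment on x64, 16-byte otherwise), presumably so the SSE4/AVX/AVX2 transform routines can use aligned vector accesses on the context. The sketch below only illustrates what that guarantees for a stack-allocated context; the include path and the assert-based harness are assumptions.

    #include <assert.h>
    #include <stdint.h>
    #include "Sha2.h"              /* assumed include path */

    int main(void)
    {
        sha512_ctx c512;
        sha256_ctx c256;

        /* The SHA2_ALIGN members force the enclosing structs to at least
           16-byte alignment, so hash[] and wbuf[] start on vector-friendly
           boundaries even for plain stack allocation. */
        assert(((uintptr_t) c512.hash % 16) == 0);
        assert(((uintptr_t) c512.wbuf % 16) == 0);
        assert(((uintptr_t) c256.hash % 16) == 0);
        assert(((uintptr_t) c256.wbuf % 16) == 0);
        return 0;
    }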
a/src/Crypto/Sources +++ b/src/Crypto/Sources @@ -9,7 +9,15 @@ NTTARGETFILES = \ "$(OBJ_PATH)\$(O)\gost89_$(TC_ARCH).obj" \ "$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).obj" \ "$(OBJ_PATH)\$(O)\Camellia_$(TC_ARCH).obj" \ - "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).obj" + "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).obj" \ + "$(OBJ_PATH)\$(O)\sha256-$(TC_ARCH)-nayuki.obj" \ + "$(OBJ_PATH)\$(O)\sha512-$(TC_ARCH)-nayuki.obj" \ + "$(OBJ_PATH)\$(O)\sha512_avx1_$(TC_ARCH).obj" \ + "$(OBJ_PATH)\$(O)\sha512_avx2_$(TC_ARCH).obj" \ + "$(OBJ_PATH)\$(O)\sha512_sse4_$(TC_ARCH).obj" \ + "$(OBJ_PATH)\$(O)\sha256_avx1_$(TC_ARCH).obj" \ + "$(OBJ_PATH)\$(O)\sha256_avx2_$(TC_ARCH).obj" \ + "$(OBJ_PATH)\$(O)\sha256_sse4_$(TC_ARCH).obj" SOURCES = \ Aes_$(TC_ARCH).asm \ @@ -30,5 +38,12 @@ SOURCES = \ Whirlpool.c \ Camellia.c \ Camellia_$(TC_ARCH).S \ - Camellia_aesni_$(TC_ARCH).S - + Camellia_aesni_$(TC_ARCH).S \ + sha256-$(TC_ARCH)-nayuki.S \ + sha512-$(TC_ARCH)-nayuki.S \ + sha512_avx1_$(TC_ARCH).asm \ + sha512_avx2_$(TC_ARCH).asm \ + sha512_sse4_$(TC_ARCH).asm \ + sha256_avx1_$(TC_ARCH).asm \ + sha256_avx2_$(TC_ARCH).asm \ + sha256_sse4_$(TC_ARCH).asm diff --git a/src/Crypto/sha256-x64-nayuki.S b/src/Crypto/sha256-x64-nayuki.S new file mode 100644 index 00000000..c6dd16d1 --- /dev/null +++ b/src/Crypto/sha256-x64-nayuki.S @@ -0,0 +1,6 @@ + + .ifndef WINABI +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + .endif \ No newline at end of file diff --git a/src/Crypto/sha256-x86-nayuki.S b/src/Crypto/sha256-x86-nayuki.S new file mode 100644 index 00000000..a8e25db7 --- /dev/null +++ b/src/Crypto/sha256-x86-nayuki.S @@ -0,0 +1,168 @@ +/* + * SHA-256 hash in x86 assembly + * + * Copyright (c) 2014 Project Nayuki. (MIT License) + * https://www.nayuki.io/page/fast-sha2-hashes-in-x86-assembly + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * - The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * - The Software is provided "as is", without warranty of any kind, express or + * implied, including but not limited to the warranties of merchantability, + * fitness for a particular purpose and noninfringement. In no event shall the + * authors or copyright holders be liable for any claim, damages or other + * liability, whether in an action of contract, tort or otherwise, arising from, + * out of or in connection with the Software or the use or other dealings in the + * Software. 
+ */ + + +/* void sha256_compress_nayuki(uint32_t state[8], const uint8_t block[64]) */ + + .ifdef MS_STDCALL + .globl _sha256_compress_nayuki@8 + _sha256_compress_nayuki@8: + .else + .globl sha256_compress_nayuki + .globl _sha256_compress_nayuki + sha256_compress_nayuki: + _sha256_compress_nayuki: + .endif + + /* + * Storage usage: + * Bytes Location Description + * 4 eax Temporary for calculation per round + * 4 ebx Temporary for calculation per round + * 4 ecx Temporary for calculation per round + * 4 edx Temporary for calculation per round + * 4 ebp Temporary for calculation per round + * 4 esi (During state loading and update) base address of state array argument + * (During hash rounds) temporary for calculation per round + * 4 edi Base address of block array argument (during key schedule loading rounds only) + * 4 esp x86 stack pointer + * 32 [esp+ 0] SHA-256 state variables A,B,C,D,E,F,G,H (4 bytes each) + * 64 [esp+ 32] Key schedule of 16 * 4 bytes + * 4 [esp+ 96] Caller's value of ebx + * 4 [esp+100] Caller's value of esi + * 4 [esp+104] Caller's value of edi + * 4 [esp+108] Caller's value of ebp + */ + + subl $112, %esp + movl %ebx, 96(%esp) + movl %esi, 100(%esp) + movl %edi, 104(%esp) + movl %ebp, 108(%esp) + + + movl 116(%esp), %esi + movl 0(%esi), %eax; movl %eax, 0(%esp) + movl 4(%esi), %eax; movl %eax, 4(%esp) + movl 8(%esi), %eax; movl %eax, 8(%esp) + movl 12(%esi), %eax; movl %eax, 12(%esp) + movl 16(%esi), %eax; movl %eax, 16(%esp) + movl 20(%esi), %eax; movl %eax, 20(%esp) + movl 24(%esi), %eax; movl %eax, 24(%esp) + movl 28(%esi), %eax; movl %eax, 28(%esp) + + + movl 120(%esp), %edi + movl (0*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((0)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x428A2F98(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp); + movl (1*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((1)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x71374491(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp); + movl (2*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((2)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; 
andl %edx, %eax; xorl %ebx, %eax; leal 0xB5C0FBCF(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp); + movl (3*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((3)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xE9B5DBA5(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp); + movl (4*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((4)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x3956C25B(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp); + movl (5*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((5)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x59F111F1(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp); + movl (6*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((6)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x923F82A4(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, 
%ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp); + movl (7*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((7)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xAB1C5ED5(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp); + movl (8*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((8)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xD807AA98(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp); + movl (9*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((9)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x12835B01(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp); + movl (10*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((10)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x243185BE(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp); + movl (11*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((11)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; 
xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x550C7DC3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp); + movl (12*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((12)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x72BE5D74(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp); + movl (13*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((13)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x80DEB1FE(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp); + movl (14*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((14)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x9BDC06A7(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp); + movl (15*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((15)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC19BF174(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; 
andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp); + movl ((((16 -15)&0xF)+8)*4)(%esp), %eax; movl ((((16 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((16 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((16 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((16)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xE49B69C1(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp); + movl ((((17 -15)&0xF)+8)*4)(%esp), %eax; movl ((((17 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((17 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((17 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((17)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xEFBE4786(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp); + movl ((((18 -15)&0xF)+8)*4)(%esp), %eax; movl ((((18 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((18 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((18 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((18)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x0FC19DC6(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl 
%ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp); + movl ((((19 -15)&0xF)+8)*4)(%esp), %eax; movl ((((19 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((19 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((19 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((19)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x240CA1CC(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp); + movl ((((20 -15)&0xF)+8)*4)(%esp), %eax; movl ((((20 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((20 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((20 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((20)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x2DE92C6F(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp); + movl ((((21 -15)&0xF)+8)*4)(%esp), %eax; movl ((((21 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((21 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((21 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((21)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x4A7484AA(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, 
%eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp); + movl ((((22 -15)&0xF)+8)*4)(%esp), %eax; movl ((((22 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((22 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((22 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((22)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x5CB0A9DC(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp); + movl ((((23 -15)&0xF)+8)*4)(%esp), %eax; movl ((((23 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((23 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((23 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((23)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x76F988DA(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp); + movl ((((24 -15)&0xF)+8)*4)(%esp), %eax; movl ((((24 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((24 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((24 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((24)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x983E5152(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; 
andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp); + movl ((((25 -15)&0xF)+8)*4)(%esp), %eax; movl ((((25 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((25 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((25 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((25)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xA831C66D(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp); + movl ((((26 -15)&0xF)+8)*4)(%esp), %eax; movl ((((26 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((26 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((26 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((26)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xB00327C8(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp); + movl ((((27 -15)&0xF)+8)*4)(%esp), %eax; movl ((((27 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((27 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((27 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((27)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xBF597FC7(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl 
%edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp); + movl ((((28 -15)&0xF)+8)*4)(%esp), %eax; movl ((((28 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((28 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((28 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((28)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC6E00BF3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp); + movl ((((29 -15)&0xF)+8)*4)(%esp), %eax; movl ((((29 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((29 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((29 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((29)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xD5A79147(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp); + movl ((((30 -15)&0xF)+8)*4)(%esp), %eax; movl ((((30 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((30 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((30 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((30)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x06CA6351(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, 
%ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp); + movl ((((31 -15)&0xF)+8)*4)(%esp), %eax; movl ((((31 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((31 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((31 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((31)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x14292967(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp); + movl ((((32 -15)&0xF)+8)*4)(%esp), %eax; movl ((((32 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((32 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((32 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((32)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x27B70A85(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp); + movl ((((33 -15)&0xF)+8)*4)(%esp), %eax; movl ((((33 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((33 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((33 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((33)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x2E1B2138(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; 
orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp); + movl ((((34 -15)&0xF)+8)*4)(%esp), %eax; movl ((((34 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((34 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((34 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((34)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x4D2C6DFC(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp); + movl ((((35 -15)&0xF)+8)*4)(%esp), %eax; movl ((((35 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((35 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((35 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((35)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x53380D13(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp); + movl ((((36 -15)&0xF)+8)*4)(%esp), %eax; movl ((((36 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((36 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((36 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((36)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x650A7354(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl 
%eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp); + movl ((((37 -15)&0xF)+8)*4)(%esp), %eax; movl ((((37 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((37 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((37 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((37)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x766A0ABB(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp); + movl ((((38 -15)&0xF)+8)*4)(%esp), %eax; movl ((((38 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((38 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((38 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((38)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x81C2C92E(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp); + movl ((((39 -15)&0xF)+8)*4)(%esp), %eax; movl ((((39 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((39 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((39 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((39)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x92722C85(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, 
%ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp); + movl ((((40 -15)&0xF)+8)*4)(%esp), %eax; movl ((((40 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((40 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((40 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((40)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xA2BFE8A1(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp); + movl ((((41 -15)&0xF)+8)*4)(%esp), %eax; movl ((((41 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((41 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((41 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((41)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xA81A664B(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp); + movl ((((42 -15)&0xF)+8)*4)(%esp), %eax; movl ((((42 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((42 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((42 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((42)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC24B8B70(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; 
addl %ecx, %esi; movl %esi, (5*4)(%esp); + movl ((((43 -15)&0xF)+8)*4)(%esp), %eax; movl ((((43 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((43 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((43 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((43)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC76C51A3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp); + movl ((((44 -15)&0xF)+8)*4)(%esp), %eax; movl ((((44 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((44 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((44 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((44)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xD192E819(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp); + movl ((((45 -15)&0xF)+8)*4)(%esp), %eax; movl ((((45 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((45 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((45 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((45)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xD6990624(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl 
%ecx, %esi; movl %esi, (2*4)(%esp); + movl ((((46 -15)&0xF)+8)*4)(%esp), %eax; movl ((((46 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((46 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((46 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((46)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xF40E3585(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp); + movl ((((47 -15)&0xF)+8)*4)(%esp), %eax; movl ((((47 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((47 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((47 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((47)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x106AA070(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp); + movl ((((48 -15)&0xF)+8)*4)(%esp), %eax; movl ((((48 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((48 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((48 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((48)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x19A4C116(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, 
%esi; movl %esi, (7*4)(%esp); + movl ((((49 -15)&0xF)+8)*4)(%esp), %eax; movl ((((49 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((49 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((49 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((49)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x1E376C08(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp); + movl ((((50 -15)&0xF)+8)*4)(%esp), %eax; movl ((((50 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((50 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((50 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((50)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x2748774C(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp); + movl ((((51 -15)&0xF)+8)*4)(%esp), %eax; movl ((((51 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((51 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((51 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((51)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x34B0BCB5(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; 
movl %esi, (4*4)(%esp); + movl ((((52 -15)&0xF)+8)*4)(%esp), %eax; movl ((((52 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((52 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((52 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((52)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x391C0CB3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp); + movl ((((53 -15)&0xF)+8)*4)(%esp), %eax; movl ((((53 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((53 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((53 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((53)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x4ED8AA4A(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp); + movl ((((54 -15)&0xF)+8)*4)(%esp), %eax; movl ((((54 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((54 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((54 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((54)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x5B9CCA4F(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl 
%esi, (1*4)(%esp); + movl ((((55 -15)&0xF)+8)*4)(%esp), %eax; movl ((((55 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((55 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((55 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((55)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x682E6FF3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp); + movl ((((56 -15)&0xF)+8)*4)(%esp), %eax; movl ((((56 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((56 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((56 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((56)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x748F82EE(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp); + movl ((((57 -15)&0xF)+8)*4)(%esp), %eax; movl ((((57 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((57 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((57 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((57)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x78A5636F(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, 
(6*4)(%esp); + movl ((((58 -15)&0xF)+8)*4)(%esp), %eax; movl ((((58 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((58 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((58 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((58)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x84C87814(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp); + movl ((((59 -15)&0xF)+8)*4)(%esp), %eax; movl ((((59 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((59 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((59 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((59)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x8CC70208(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp); + movl ((((60 -15)&0xF)+8)*4)(%esp), %eax; movl ((((60 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((60 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((60 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((60)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x90BEFFFA(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, 
(3*4)(%esp); + movl ((((61 -15)&0xF)+8)*4)(%esp), %eax; movl ((((61 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((61 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((61 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((61)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xA4506CEB(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp); + movl ((((62 -15)&0xF)+8)*4)(%esp), %eax; movl ((((62 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((62 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((62 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((62)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xBEF9A3F7(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp); + movl ((((63 -15)&0xF)+8)*4)(%esp), %eax; movl ((((63 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((63 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((63 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((63)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC67178F2(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, 
(0*4)(%esp); + + + movl 116(%esp), %esi + movl 0(%esp), %eax; addl %eax, 0(%esi) + movl 4(%esp), %eax; addl %eax, 4(%esi) + movl 8(%esp), %eax; addl %eax, 8(%esi) + movl 12(%esp), %eax; addl %eax, 12(%esi) + movl 16(%esp), %eax; addl %eax, 16(%esi) + movl 20(%esp), %eax; addl %eax, 20(%esi) + movl 24(%esp), %eax; addl %eax, 24(%esi) + movl 28(%esp), %eax; addl %eax, 28(%esi) + + + movl 96(%esp), %ebx + movl 100(%esp), %esi + movl 104(%esp), %edi + movl 108(%esp), %ebp + addl $112, %esp + .ifdef MS_STDCALL + ret $8 + .else + retl + .endif + + .ifndef WINABI +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + .endif \ No newline at end of file diff --git a/src/Crypto/sha256_avx1_x64.asm b/src/Crypto/sha256_avx1_x64.asm new file mode 100644 index 00000000..5c4ce559 --- /dev/null +++ b/src/Crypto/sha256_avx1_x64.asm @@ -0,0 +1,596 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm +; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. 
+; The paper is expected to be released roughly at the end of April, 2012 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; addm [mem], reg +; Add reg to mem using reg-mem add and store +%macro addm 2 + add %2, %1 + mov %1, %2 +%endm + +%macro MY_ROR 2 + shld %1,%1,(32-(%2)) +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + VMOVDQ %1, %2 + vpshufb %1, %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XTMP4 xmm8 +%define XFER xmm9 +%define XTMP5 xmm11 + +%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA +%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK xmm13 + +%ifndef WINABI +%define NUM_BLKS rdx ; 3rd arg +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c ecx +%define d r8d +%define e edx +%else +%define NUM_BLKS r8 ; 3rd arg +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c edi +%define d esi +%define e r8d + +%endif +%define TBL rbp +%define a eax +%define b ebx + +%define f r9d +%define g r10d +%define h r11d + +%define y0 r13d +%define y1 r14d +%define y2 r15d + + +_INP_END_SIZE equ 8 +_INP_SIZE equ 8 +_XFER_SIZE equ 8 +%ifndef WINABI +_XMM_SAVE_SIZE equ 0 +%else +_XMM_SAVE_SIZE equ 8*16 +%endif +; STACK_SIZE plus pushes must be an odd multiple of 8 +_ALIGN_SIZE equ 8 + +_INP_END equ 0 +_INP equ _INP_END + _INP_END_SIZE +_XFER equ _INP + _INP_SIZE +_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE +STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro FOUR_ROUNDS_AND_SCHED 0 + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + ;vmovdqa XTMP0, X3 + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7] + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + ;vmovdqa XTMP1, X1 + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + ;; compute s0 + vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH + + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + + vpsrld 
XTMP2, XTMP1, 7 + + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + + vpslld XTMP3, XTMP1, (32-7) + + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + + vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 + + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + + mov y0, e ; y0 = e + mov y1, a ; y1 = a + + + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + + vpsrld XTMP2, XTMP1,18 + + xor y1, a ; y1 = a ^ (a >> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + + vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3 + + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + + vpslld XTMP1, XTMP1, (32-18) + + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + vpxor XTMP3, XTMP3, XTMP1 + + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + + vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 + + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + + vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0 + + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + ;; compute low s1 + vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} + + mov y0, e ; y0 = e + mov y1, a ; y1 = a + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + + ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} + + xor y0, e ; y0 = e ^ (e >> (25-11)) + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + + vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + + xor y2, g ; y2 = f^g + + vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA} + + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + + vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA} + + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpxor XTMP2, XTMP2, XTMP3 + add y2, y0 ; y2 = S1 + CH + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH + vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + ;; compute high s1 + vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + 
mov y1, a ; y1 = a + ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC} + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + + vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC} + + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + + vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC} + + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + + vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC} + + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + vpxor XTMP2, XTMP2, XTMP3 + + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH + vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 4] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (25-11)) + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +;; arg 3 : Num blocks +section .text +global sha256_avx +align 32 +sha256_avx: + push rbx +%ifdef WINABI + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_SIZE +%ifdef WINABI + vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 + vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 + vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 + vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 + vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 + vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 + vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 + vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 +%endif + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + add NUM_BLKS, INP ; pointer to end of data + mov [rsp + _INP_END], NUM_BLKS + + ;; load initial digest + 
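+	;; CTX points to the caller's digest[8]: the eight working variables
+	;; a..h are seeded from H0..H7, one dword each, and are folded back
+	;; into CTX with addm once all 64 rounds of a block have completed.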
mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] + vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip] + vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip] + +loop0: + lea TBL,[K256 wrt rip] + + ;; byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + mov [rsp + _INP], INP + + ;; schedule 48 input dwords, by doing 3 rounds of 16 each + mov SRND, 3 +align 16 +loop1: + vpaddd XFER, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 1*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 2*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 3*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 +loop2: + vpaddd XFER, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], XFER + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vpaddd XFER, X1, [TBL + 1*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vmovdqa X0, X2 + vmovdqa X1, X3 + + sub SRND, 1 + jne loop2 + + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h + + mov INP, [rsp + _INP] + add INP, 64 + cmp INP, [rsp + _INP_END] + jne loop0 + +done_hash: +%ifdef WINABI + vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] + vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] + vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] + vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] + vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] + vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] + vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] + vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] +%endif + + + add rsp, STACK_SIZE + + pop r15 + pop r14 + pop r13 + pop rbp +%ifdef WINABI + pop rdi + pop rsi +%endif + pop rbx + + ret + + +section .data +align 64 +K256: + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203 + +; shuffle xBxA -> 00BA +_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +; shuffle xDxC -> DC00 +_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git 
a/src/Crypto/sha256_avx1_x86.asm b/src/Crypto/sha256_avx1_x86.asm new file mode 100644 index 00000000..31c8bd0d --- /dev/null +++ b/src/Crypto/sha256_avx1_x86.asm @@ -0,0 +1,10 @@ + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha256_avx2_x64.asm b/src/Crypto/sha256_avx2_x64.asm new file mode 100644 index 00000000..458c2945 --- /dev/null +++ b/src/Crypto/sha256_avx2_x64.asm @@ -0,0 +1,840 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx2.obj -g cv8 sha256_avx2_rorx2.asm +; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx2.o sha256_avx2_rorx2.asm +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. +; The paper is expected to be released roughly at the end of April, 2012 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 2 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Modified by kerukuro for use in cppcrypto. 
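+;
+; Usage sketch (illustrative): the sha256_rorx entry point defined below
+; consumes whole 64-byte message blocks and updates a caller-maintained
+; state of eight 32-bit words:
+;
+;     extern void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks);
+;     /* digest[] holds the running H0..H7 state; num_blks counts 64-byte */
+;     /* blocks, so a 128-byte buffer is processed as (buf, digest, 2).   */
+;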
+ +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; addm [mem], reg +; Add reg to mem using reg-mem add and store +%macro addm 2 + add %2, %1 + mov %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 ymm4 +%define X1 ymm5 +%define X2 ymm6 +%define X3 ymm7 + +; XMM versions of above +%define XWORD0 xmm4 +%define XWORD1 xmm5 +%define XWORD2 xmm6 +%define XWORD3 xmm7 + +%define XTMP0 ymm0 +%define XTMP1 ymm1 +%define XTMP2 ymm2 +%define XTMP3 ymm3 +%define XTMP4 ymm8 +%define XFER ymm9 +%define XTMP5 ymm11 + +%define SHUF_00BA ymm10 ; shuffle xBxA -> 00BA +%define SHUF_DC00 ymm12 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK ymm13 + +%define X_BYTE_FLIP_MASK xmm13 ; XMM version of BYTE_FLIP_MASK + +%ifndef WINABI +%define NUM_BLKS rdx ; 3rd arg +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg +%define c ecx +%define d r8d +%define e edx ; clobbers NUM_BLKS +%define y3 edi ; clobbers INP +%else +%define NUM_BLKS r8 ; 3rd arg +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg +%define c edi +%define d esi +%define e r8d ; clobbers NUM_BLKS +%define y3 ecx ; clobbers INP + +%endif + + +%define TBL rbp +%define SRND CTX ; SRND is same register as CTX + +%define a eax +%define b ebx +%define f r9d +%define g r10d +%define h r11d +%define old_h r11d + +%define T1 r12d +%define y0 r13d +%define y1 r14d +%define y2 r15d + + +_XFER_SIZE equ 2*64*4 ; 2 blocks, 64 rounds, 4 bytes/round +%ifndef WINABI +_XMM_SAVE_SIZE equ 0 +%else +_XMM_SAVE_SIZE equ 8*16 +%endif +_INP_END_SIZE equ 8 +_INP_SIZE equ 8 +_CTX_SIZE equ 8 +_RSP_SIZE equ 8 + +_XFER equ 0 +_XMM_SAVE equ _XFER + _XFER_SIZE +_INP_END equ _XMM_SAVE + _XMM_SAVE_SIZE +_INP equ _INP_END + _INP_END_SIZE +_CTX equ _INP + _INP_SIZE +_RSP equ _CTX + _CTX_SIZE +STACK_SIZE equ _RSP + _RSP_SIZE + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine old_h h +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro FOUR_ROUNDS_AND_SCHED 1 +%define %%XFER %1 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov y3, a ; y3 = a ; MAJA + rorx y0, e, 25 ; y0 = e >> 25 ; S1A + rorx y1, e, 11 ; y1 = e >> 11 ; S1B + + add h, dword[%%XFER+0*4] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7] + mov y2, f ; y2 = f ; CH + rorx T1, a, 13 ; T1 = a >> 13 ; S0B + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1 + xor y2, g ; y2 = f^g ; CH + vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1 + rorx y1, e, 6 ; y1 = (e >> 6) ; S1 + + and y2, e ; y2 = (f^g)&e ; CH + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 + rorx y1, a, 22 ; y1 = a >> 22 ; S0A + add d, h ; d = k + w + h + d ; -- + + and y3, b ; y3 = (a|c)&b ; MAJA + vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15] + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0 + rorx T1, a, 2 ; T1 = (a >> 2) ; S0 + + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + vpsrld XTMP2, XTMP1, 7 + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 + mov T1, a ; T1 = a ; MAJB + and T1, c ; T1 = a&c ; MAJB + + add y2, y0 ; y2 = S1 + CH ; -- + vpslld XTMP3, XTMP1, (32-7) + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + vpor XTMP3, XTMP3, 
XTMP2 ; XTMP3 = W[-15] ror 7 + + vpsrld XTMP2, XTMP1,18 + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + add h, y3 ; h = t1 + S0 + MAJ ; -- + + +ROTATE_ARGS + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + mov y3, a ; y3 = a ; MAJA + rorx y0, e, 25 ; y0 = e >> 25 ; S1A + rorx y1, e, 11 ; y1 = e >> 11 ; S1B + add h, dword[%%XFER+1*4] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + + vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3 + mov y2, f ; y2 = f ; CH + rorx T1, a, 13 ; T1 = a >> 13 ; S0B + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1 + xor y2, g ; y2 = f^g ; CH + + + rorx y1, e, 6 ; y1 = (e >> 6) ; S1 + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 + rorx y1, a, 22 ; y1 = a >> 22 ; S0A + and y2, e ; y2 = (f^g)&e ; CH + add d, h ; d = k + w + h + d ; -- + + vpslld XTMP1, XTMP1, (32-18) + and y3, b ; y3 = (a|c)&b ; MAJA + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0 + + vpxor XTMP3, XTMP3, XTMP1 + rorx T1, a, 2 ; T1 = (a >> 2) ; S0 + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + + vpxor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 + mov T1, a ; T1 = a ; MAJB + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0 + vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + add h, y3 ; h = t1 + S0 + MAJ ; -- + + vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + + +ROTATE_ARGS + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov y3, a ; y3 = a ; MAJA + rorx y0, e, 25 ; y0 = e >> 25 ; S1A + add h, [%%XFER+2*4] ; h = k + w + h ; -- + + vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] ror 19 {xBxA} + rorx y1, e, 11 ; y1 = e >> 11 ; S1B + or y3, c ; y3 = a|c ; MAJA + mov y2, f ; y2 = f ; CH + xor y2, g ; y2 = f^g ; CH + + rorx T1, a, 13 ; T1 = a >> 13 ; S0B + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1 + vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} + and y2, e ; y2 = (f^g)&e ; CH + + rorx y1, e, 6 ; y1 = (e >> 6) ; S1 + vpxor XTMP2, XTMP2, XTMP3 + add d, h ; d = k + w + h + d ; -- + and y3, b ; y3 = (a|c)&b ; MAJA + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 + rorx y1, a, 22 ; y1 = a >> 22 ; S0A + vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + + vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0 + rorx T1, a, 2 ; T1 = (a >> 2) ; S0 + vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 + mov T1, a ; T1 = a ; MAJB + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} + + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + + add h, y3 ; h = t1 + S0 + MAJ ; -- + + +ROTATE_ARGS + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov y3, a ; y3 = a ; MAJA + rorx y0, e, 25 ; y0 = e >> 25 ; S1A + rorx y1, e, 11 ; y1 = e >> 11 ; S1B + add h, dword[%%XFER+3*4] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + + vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC} + mov y2, f 
; y2 = f ; CH + rorx T1, a, 13 ; T1 = a >> 13 ; S0B + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1 + xor y2, g ; y2 = f^g ; CH + + + vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] ror 19 {xDxC} + rorx y1, e, 6 ; y1 = (e >> 6) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + add d, h ; d = k + w + h + d ; -- + and y3, b ; y3 = (a|c)&b ; MAJA + + vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC} + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + + vpxor XTMP2, XTMP2, XTMP3 + rorx y1, a, 22 ; y1 = a >> 22 ; S0A + add y2, y0 ; y2 = S1 + CH ; -- + + vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC} + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0 + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + rorx T1, a, 2 ; T1 = (a >> 2) ; S0 + vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00} + + vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 + mov T1, a ; T1 = a ; MAJB + and T1, c ; T1 = a&c ; MAJB + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + + add h, y1 ; h = k + w + h + S0 ; -- + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + add h, y3 ; h = t1 + S0 + MAJ ; -- + +ROTATE_ARGS +rotate_Xs +%endm + +%macro DO_4ROUNDS 1 +%define %%XFER %1 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov y2, f ; y2 = f ; CH + rorx y0, e, 25 ; y0 = e >> 25 ; S1A + rorx y1, e, 11 ; y1 = e >> 11 ; S1B + xor y2, g ; y2 = f^g ; CH + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1 + rorx y1, e, 6 ; y1 = (e >> 6) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 + rorx T1, a, 13 ; T1 = a >> 13 ; S0B + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + rorx y1, a, 22 ; y1 = a >> 22 ; S0A + mov y3, a ; y3 = a ; MAJA + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0 + rorx T1, a, 2 ; T1 = (a >> 2) ; S0 + add h, dword[%%XFER + 4*0] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 + mov T1, a ; T1 = a ; MAJB + and y3, b ; y3 = (a|c)&b ; MAJA + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + + add d, h ; d = k + w + h + d ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + + ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + + ;add h, y3 ; h = t1 + S0 + MAJ ; -- + + ROTATE_ARGS + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + mov y2, f ; y2 = f ; CH + rorx y0, e, 25 ; y0 = e >> 25 ; S1A + rorx y1, e, 11 ; y1 = e >> 11 ; S1B + xor y2, g ; y2 = f^g ; CH + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1 + rorx y1, e, 6 ; y1 = (e >> 6) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + add old_h, y3 ; h = t1 + S0 + MAJ ; -- + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 + rorx T1, a, 13 ; T1 = a >> 13 ; S0B + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + rorx y1, a, 22 ; y1 = a >> 22 ; S0A + mov y3, a ; y3 = a ; MAJA + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0 + rorx T1, a, 2 ; T1 = (a >> 2) ; S0 + add h, dword[%%XFER + 4*1] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 + mov T1, a ; T1 = a ; MAJB + and y3, b ; y3 = (a|c)&b ; MAJA + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + + add d, h ; d = k + w + h + d ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + + 
;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + + ;add h, y3 ; h = t1 + S0 + MAJ ; -- + + ROTATE_ARGS + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + mov y2, f ; y2 = f ; CH + rorx y0, e, 25 ; y0 = e >> 25 ; S1A + rorx y1, e, 11 ; y1 = e >> 11 ; S1B + xor y2, g ; y2 = f^g ; CH + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1 + rorx y1, e, 6 ; y1 = (e >> 6) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + add old_h, y3 ; h = t1 + S0 + MAJ ; -- + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 + rorx T1, a, 13 ; T1 = a >> 13 ; S0B + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + rorx y1, a, 22 ; y1 = a >> 22 ; S0A + mov y3, a ; y3 = a ; MAJA + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0 + rorx T1, a, 2 ; T1 = (a >> 2) ; S0 + add h, dword[%%XFER + 4*2] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 + mov T1, a ; T1 = a ; MAJB + and y3, b ; y3 = (a|c)&b ; MAJA + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + + add d, h ; d = k + w + h + d ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + + ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + + ;add h, y3 ; h = t1 + S0 + MAJ ; -- + + ROTATE_ARGS + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + mov y2, f ; y2 = f ; CH + rorx y0, e, 25 ; y0 = e >> 25 ; S1A + rorx y1, e, 11 ; y1 = e >> 11 ; S1B + xor y2, g ; y2 = f^g ; CH + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1 + rorx y1, e, 6 ; y1 = (e >> 6) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + add old_h, y3 ; h = t1 + S0 + MAJ ; -- + + xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 + rorx T1, a, 13 ; T1 = a >> 13 ; S0B + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + rorx y1, a, 22 ; y1 = a >> 22 ; S0A + mov y3, a ; y3 = a ; MAJA + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0 + rorx T1, a, 2 ; T1 = (a >> 2) ; S0 + add h, dword[%%XFER + 4*3] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 + mov T1, a ; T1 = a ; MAJB + and y3, b ; y3 = (a|c)&b ; MAJA + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + + add d, h ; d = k + w + h + d ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- + + add h, y3 ; h = t1 + S0 + MAJ ; -- + + ROTATE_ARGS + +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +;; arg 3 : Num blocks +section .text +global sha256_rorx +global _sha256_rorx +align 32 +sha256_rorx: +_sha256_rorx: + push rbx +%ifdef WINABI + push rsi + push rdi +%endif + push rbp + push r12 + push r13 + push r14 + push r15 + + mov rax, rsp + sub rsp,STACK_SIZE + and rsp, -32 + mov [rsp + _RSP], rax + +%ifdef WINABI + vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 + vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 + vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 + vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 + vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 + vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 + vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 + vmovdqa 
[rsp + _XMM_SAVE + 7*16],xmm13 +%endif + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + lea NUM_BLKS, [NUM_BLKS + INP - 64] ; pointer to last block + mov [rsp + _INP_END], NUM_BLKS + + cmp INP, NUM_BLKS + je only_one_block + + ;; load initial digest + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] + vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip] + vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip] + + mov [rsp + _CTX], CTX + +loop0: + lea TBL,[K256 wrt rip] + + ;; Load first 16 dwords from two blocks + VMOVDQ XTMP0, [INP + 0*32] + VMOVDQ XTMP1, [INP + 1*32] + VMOVDQ XTMP2, [INP + 2*32] + VMOVDQ XTMP3, [INP + 3*32] + + ;; byte swap data + vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK + vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK + vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK + vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK + + ;; transpose data into high/low halves + vperm2i128 X0, XTMP0, XTMP2, 0x20 + vperm2i128 X1, XTMP0, XTMP2, 0x31 + vperm2i128 X2, XTMP1, XTMP3, 0x20 + vperm2i128 X3, XTMP1, XTMP3, 0x31 + +last_block_enter: + add INP, 64 + mov [rsp + _INP], INP + + ;; schedule 48 input dwords, by doing 3 rounds of 12 each + xor SRND, SRND + +align 16 +loop1: + vpaddd XFER, X0, [TBL + SRND + 0*32] + vmovdqa [rsp + _XFER + SRND + 0*32], XFER + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32 + + vpaddd XFER, X0, [TBL + SRND + 1*32] + vmovdqa [rsp + _XFER + SRND + 1*32], XFER + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32 + + vpaddd XFER, X0, [TBL + SRND + 2*32] + vmovdqa [rsp + _XFER + SRND + 2*32], XFER + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32 + + vpaddd XFER, X0, [TBL + SRND + 3*32] + vmovdqa [rsp + _XFER + SRND + 3*32], XFER + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32 + + add SRND, 4*32 + cmp SRND, 3 * 4*32 + jb loop1 + +loop2: + ;; Do last 16 rounds with no scheduling + vpaddd XFER, X0, [TBL + SRND + 0*32] + vmovdqa [rsp + _XFER + SRND + 0*32], XFER + DO_4ROUNDS rsp + _XFER + SRND + 0*32 + vpaddd XFER, X1, [TBL + SRND + 1*32] + vmovdqa [rsp + _XFER + SRND + 1*32], XFER + DO_4ROUNDS rsp + _XFER + SRND + 1*32 + add SRND, 2*32 + + vmovdqa X0, X2 + vmovdqa X1, X3 + + cmp SRND, 4 * 4*32 + jb loop2 + + mov CTX, [rsp + _CTX] + mov INP, [rsp + _INP] + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h + + cmp INP, [rsp + _INP_END] + ja done_hash + + ;;;; Do second block using previously scheduled results + xor SRND, SRND +align 16 +loop3: + DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16 + DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16 + add SRND, 2*32 + cmp SRND, 4 * 4*32 + jb loop3 + + mov CTX, [rsp + _CTX] + mov INP, [rsp + _INP] + add INP, 64 + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h + + cmp INP, [rsp + _INP_END] + jb loop0 + ja done_hash + +do_last_block: + ;;;; do last block + lea TBL,[K256 wrt rip] + + VMOVDQ XWORD0, [INP + 0*16] + VMOVDQ XWORD1, [INP + 1*16] + VMOVDQ XWORD2, [INP + 2*16] + VMOVDQ XWORD3, [INP + 3*16] + + vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK + vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK + vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK + vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK + + jmp last_block_enter + +only_one_block: + + ;; load initial digest + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + 
CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] + vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip] + vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip] + + mov [rsp + _CTX], CTX + jmp do_last_block + +done_hash: +%ifdef WINABI + vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] + vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] + vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] + vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] + vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] + vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] + vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] + vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] +%endif + + mov rsp, [rsp + _RSP] + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp +%ifdef WINABI + pop rdi + pop rsi +%endif + pop rbx + + ret + +section .data +align 64 +K256: + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + ddq 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 + +; shuffle xBxA -> 00BA +_SHUF_00BA: + ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 + +; shuffle xDxC -> DC00 +_SHUF_DC00: + ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha256_avx2_x86.asm b/src/Crypto/sha256_avx2_x86.asm new file mode 100644 index 00000000..31c8bd0d --- /dev/null +++ b/src/Crypto/sha256_avx2_x86.asm @@ -0,0 +1,10 @@ + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha256_sse4_x64.asm 
b/src/Crypto/sha256_sse4_x64.asm new file mode 100644 index 00000000..c11630bc --- /dev/null +++ b/src/Crypto/sha256_sse4_x64.asm @@ -0,0 +1,560 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm +; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. +; The paper is expected to be released roughly at the end of April, 2012 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Modified by kerukuro for use in cppcrypto. 
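For context, the routine defined in this file is a raw block-compression entry point: as the prototype comment further down states, it takes a pointer to the input data, the 8-word digest state, and a count of 64-byte blocks (NUM_BLKS is shifted left by 6 to convert it to a byte count). A minimal C-level driver might look like the following sketch; the sha256_sse4 prototype is taken from this file, while the helper name and IV table are illustrative and not part of the patch:

#include <stdint.h>
#include <string.h>

void sha256_sse4(void *input_data, uint32_t digest[8], uint64_t num_blks);

/* Standard SHA-256 initial hash values (FIPS 180-4). */
static const uint32_t sha256_iv[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

/* Compress nblocks complete 64-byte blocks into state; buffering of partial
   blocks, padding and finalization are handled by the caller (sketch only). */
static void sha256_compress_blocks(uint32_t state[8], const uint8_t *data,
                                   uint64_t nblocks, int first_call)
{
    if (first_call)
        memcpy(state, sha256_iv, sizeof sha256_iv);
    if (nblocks)
        sha256_sse4((void *) data, state, nblocks);
}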
+ +; Modified By Mounir IDRASSI for use in VeraCrypt + +%define MOVDQ movdqu ;; assume buffers not aligned + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; addm [mem], reg +; Add reg to mem using reg-mem add and store +%macro addm 2 + add %2, %1 + mov %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + MOVDQ %1, %2 + pshufb %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XTMP4 xmm8 +%define XFER xmm9 + +%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA +%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK xmm12 + +%ifndef WINABI +%define NUM_BLKS rdx ; 3rd arg +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c ecx +%define d r8d +%define e edx +%else +%define NUM_BLKS r8 ; 3rd arg +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c edi +%define d esi +%define e r8d + +%endif +%define TBL rbp +%define a eax +%define b ebx + +%define f r9d +%define g r10d +%define h r11d + +%define y0 r13d +%define y1 r14d +%define y2 r15d + + + +_INP_END_SIZE equ 8 +_INP_SIZE equ 8 +_XFER_SIZE equ 8 +%ifndef WINABI +_XMM_SAVE_SIZE equ 0 +%else +_XMM_SAVE_SIZE equ 7*16 +%endif +; STACK_SIZE plus pushes must be an odd multiple of 8 +_ALIGN_SIZE equ 8 + +_INP_END equ 0 +_INP equ _INP_END + _INP_END_SIZE +_XFER equ _INP + _INP_SIZE +_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE +STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro FOUR_ROUNDS_AND_SCHED 0 + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + movdqa XTMP0, X3 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + palignr XTMP0, X2, 4 ; XTMP0 = W[-7] + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa XTMP1, X1 + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + ;; compute s0 + palignr XTMP1, X0, 4 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH + movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pslld XTMP1, (32-7) + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + psrld XTMP2, 7 + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = 
h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] + mov y0, e ; y0 = e + mov y1, a ; y1 = a + movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] + ror y0, (25-11) ; y0 = e >> (25-11) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y1, (22-13) ; y1 = a >> (22-13) + pslld XTMP3, (32-18) + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + psrld XTMP2, 18 + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP1, XTMP3 + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pxor XTMP1, XTMP4 ; XTMP1 = s0 + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + ;; compute low s1 + pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} + mov y0, e ; y0 = e + mov y1, a ; y1 = a + ror y0, (25-11) ; y0 = e >> (25-11) + movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP2, XTMP3 + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH + pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + ;; compute high s1 + pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC} + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; 
y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + pxor XTMP2, XTMP3 + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH + pxor X0, XTMP2 ; X0 = s1 {xDxC} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 4] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +;; arg 3 : Num blocks +section .text +global sha256_sse4 +global _sha256_sse4 +align 32 +sha256_sse4: +_sha256_sse4: + push rbx +%ifdef WINABI + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_SIZE +%ifdef WINABI + movdqa [rsp + _XMM_SAVE + 0*16],xmm6 + movdqa [rsp + _XMM_SAVE + 1*16],xmm7 + movdqa [rsp + _XMM_SAVE + 2*16],xmm8 + movdqa [rsp + _XMM_SAVE + 3*16],xmm9 + movdqa [rsp + _XMM_SAVE + 4*16],xmm10 + movdqa [rsp + _XMM_SAVE + 5*16],xmm11 + movdqa [rsp + _XMM_SAVE + 6*16],xmm12 +%endif + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + add NUM_BLKS, INP ; pointer to end of data + mov [rsp + _INP_END], NUM_BLKS + + ;; load initial digest + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] + movdqa SHUF_00BA, [_SHUF_00BA wrt rip] + movdqa SHUF_DC00, [_SHUF_DC00 wrt rip] + +loop0: + lea TBL,[K256 wrt rip] + + ;; byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + mov [rsp + _INP], INP + 
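The loop that follows expands the 16 byte-swapped dwords into the full 64-entry SHA-256 message schedule, with FOUR_ROUNDS_AND_SCHED producing four W values per invocation using the SSE shifts and shuffles defined above. For reference, the scalar recurrence being vectorized is the standard one; a minimal C sketch, not part of the patch:

#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* W[0..15] hold the byte-swapped input block; expand to W[16..63]. */
static void sha256_expand(uint32_t W[64])
{
    for (int t = 16; t < 64; t++) {
        uint32_t s0 = rotr32(W[t - 15], 7) ^ rotr32(W[t - 15], 18) ^ (W[t - 15] >> 3);
        uint32_t s1 = rotr32(W[t - 2], 17) ^ rotr32(W[t - 2], 19) ^ (W[t - 2] >> 10);
        W[t] = W[t - 16] + s0 + W[t - 7] + s1;
    }
}

In the assembly, each group of four expanded words is added to the matching K256 constants and spilled to the _XFER stack slot, which the round code then reads back through [rsp + _XFER + n*4].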
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each + mov SRND, 3 +align 16 +loop1: + movdqa XFER, [TBL + 0*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 1*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 2*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 3*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 +loop2: + paddd X0, [TBL + 0*16] + movdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd X1, [TBL + 1*16] + movdqa [rsp + _XFER], X1 + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X0, X2 + movdqa X1, X3 + + sub SRND, 1 + jne loop2 + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h + + mov INP, [rsp + _INP] + add INP, 64 + cmp INP, [rsp + _INP_END] + jne loop0 + +done_hash: +%ifdef WINABI + movdqa xmm6,[rsp + _XMM_SAVE + 0*16] + movdqa xmm7,[rsp + _XMM_SAVE + 1*16] + movdqa xmm8,[rsp + _XMM_SAVE + 2*16] + movdqa xmm9,[rsp + _XMM_SAVE + 3*16] + movdqa xmm10,[rsp + _XMM_SAVE + 4*16] + movdqa xmm11,[rsp + _XMM_SAVE + 5*16] + movdqa xmm12,[rsp + _XMM_SAVE + 6*16] +%endif + + add rsp, STACK_SIZE + + pop r15 + pop r14 + pop r13 + pop rbp +%ifdef WINABI + pop rdi + pop rsi +%endif + pop rbx + + ret + + +section .data +align 64 +K256: + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203 + +; shuffle xBxA -> 00BA +_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +; shuffle xDxC -> DC00 +_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha256_sse4_x86.asm b/src/Crypto/sha256_sse4_x86.asm new file mode 100644 index 00000000..31c8bd0d --- /dev/null +++ b/src/Crypto/sha256_sse4_x86.asm @@ -0,0 +1,10 @@ + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha512-x64-nayuki.S b/src/Crypto/sha512-x64-nayuki.S new file mode 100644 index 00000000..0e36ac91 --- /dev/null +++ b/src/Crypto/sha512-x64-nayuki.S @@ -0,0 +1,202 @@ +/* + * 
SHA-512 hash in x86-64 assembly + * + * Copyright (c) 2017 Project Nayuki. (MIT License) + * https://www.nayuki.io/page/fast-sha2-hashes-in-x86-assembly + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * - The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * - The Software is provided "as is", without warranty of any kind, express or + * implied, including but not limited to the warranties of merchantability, + * fitness for a particular purpose and noninfringement. In no event shall the + * authors or copyright holders be liable for any claim, damages or other + * liability, whether in an action of contract, tort or otherwise, arising from, + * out of or in connection with the Software or the use or other dealings in the + * Software. + */ + +# Adapted for VeraCrypt +# Adapt to Windows calling convention when building on Windows. +# avoid using xmm6 register since it must be preserved on Windows. We use MMX registers instead. + + +/* void sha512_compress_nayuki(uint64_t state[8], const uint8_t block[128]) */ +.globl sha512_compress_nayuki +.globl _sha512_compress_nayuki +sha512_compress_nayuki: +_sha512_compress_nayuki: + /* + * Storage usage: + * Bytes Location Description + * 8 rax Temporary for calculation per round + * 8 rbx Temporary for calculation per round + * 8 rcx Temporary for calculation per round + * 8 rdx Temporary for calculation per round + * 8 rsi Base address of block array argument (read-only) + * 8 rdi Base address of state array argument (read-only) + * 8 rsp x86-64 stack pointer + * 8 r8 SHA-512 state variable A + * 8 r9 SHA-512 state variable B + * 8 r10 SHA-512 state variable C + * 8 r11 SHA-512 state variable D + * 8 r12 SHA-512 state variable E + * 8 r13 SHA-512 state variable F + * 8 r14 SHA-512 state variable G + * 8 r15 SHA-512 state variable H + * 128 [rsp+0] Circular buffer of most recent 16 key schedule items, 8 bytes each + * 16 xmm0 Caller's value of r10 (only low 64 bits are used) + * 16 xmm1 Caller's value of r11 (only low 64 bits are used) + * 16 xmm2 Caller's value of r12 (only low 64 bits are used) + * 16 xmm3 Caller's value of r13 (only low 64 bits are used) + * 16 xmm4 Caller's value of r14 (only low 64 bits are used) + * 16 xmm5 Caller's value of r15 (only low 64 bits are used) + * 8 mm0 Caller's value of rbx + */ + movq %r10, %xmm0 + movq %r11, %xmm1 + movq %r12, %xmm2 + movq %r13, %xmm3 + movq %r14, %xmm4 + movq %r15, %xmm5 + movq %rbx, %mm0 +.ifdef WINABI + movq %rdi, %mm1 + movq %rsi, %mm2 + movq %rcx, %rdi + movq %rdx, %rsi +.endif + subq $128, %rsp + + + movq 0(%rdi), %r8 + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + movq 32(%rdi), %r12 + movq 40(%rdi), %r13 + movq 48(%rdi), %r14 + movq 56(%rdi), %r15 + + + movq (0*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((0)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x428A2F98D728AE22, 
%rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (1*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((1)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x7137449123EF65CD, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (2*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((2)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0xB5C0FBCFEC4D3B2F, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (3*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((3)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0xE9B5DBA58189DBBC, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (4*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((4)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x3956C25BF348B538, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (5*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((5)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x59F111F1B605D019, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; 
orq %rcx, %rax; addq %rax, %r10; + movq (6*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((6)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x923F82A4AF194F9B, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (7*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((7)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0xAB1C5ED5DA6D8118, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (8*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((8)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0xD807AA98A3030242, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (9*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((9)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x12835B0145706FBE, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (10*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((10)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x243185BE4EE4B28C, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (11*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((11)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq 
%r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x550C7DC3D5FFB4E2, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (12*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((12)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x72BE5D74F27B896F, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (13*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((13)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x80DEB1FE3B1696B1, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (14*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((14)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x9BDC06A725C71235, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (15*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((15)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0xC19BF174CF692694, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (((16 -15)&0xF)*8)(%rsp), %rax; movq (((16 -16)&0xF)*8)(%rsp), %rbx; addq (((16 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((16 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((16)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; 
xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0xE49B69C19EF14AD2, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (((17 -15)&0xF)*8)(%rsp), %rax; movq (((17 -16)&0xF)*8)(%rsp), %rbx; addq (((17 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((17 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((17)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0xEFBE4786384F25E3, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (((18 -15)&0xF)*8)(%rsp), %rax; movq (((18 -16)&0xF)*8)(%rsp), %rbx; addq (((18 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((18 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((18)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x0FC19DC68B8CD5B5, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (((19 -15)&0xF)*8)(%rsp), %rax; movq (((19 -16)&0xF)*8)(%rsp), %rbx; addq (((19 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((19 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((19)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x240CA1CC77AC9C65, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq 
%rax, %r12; + movq (((20 -15)&0xF)*8)(%rsp), %rax; movq (((20 -16)&0xF)*8)(%rsp), %rbx; addq (((20 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((20 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((20)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x2DE92C6F592B0275, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (((21 -15)&0xF)*8)(%rsp), %rax; movq (((21 -16)&0xF)*8)(%rsp), %rbx; addq (((21 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((21 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((21)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x4A7484AA6EA6E483, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (((22 -15)&0xF)*8)(%rsp), %rax; movq (((22 -16)&0xF)*8)(%rsp), %rbx; addq (((22 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((22 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((22)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x5CB0A9DCBD41FBD4, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (((23 -15)&0xF)*8)(%rsp), %rax; movq (((23 -16)&0xF)*8)(%rsp), %rbx; addq (((23 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((23 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((23)&0xF)*8)(%rsp); movq %r13, %rcx; 
movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x76F988DA831153B5, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (((24 -15)&0xF)*8)(%rsp), %rax; movq (((24 -16)&0xF)*8)(%rsp), %rbx; addq (((24 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((24 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((24)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x983E5152EE66DFAB, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (((25 -15)&0xF)*8)(%rsp), %rax; movq (((25 -16)&0xF)*8)(%rsp), %rbx; addq (((25 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((25 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((25)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0xA831C66D2DB43210, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (((26 -15)&0xF)*8)(%rsp), %rax; movq (((26 -16)&0xF)*8)(%rsp), %rbx; addq (((26 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((26 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((26)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0xB00327C898FB213F, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, 
%rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (((27 -15)&0xF)*8)(%rsp), %rax; movq (((27 -16)&0xF)*8)(%rsp), %rbx; addq (((27 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((27 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((27)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0xBF597FC7BEEF0EE4, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (((28 -15)&0xF)*8)(%rsp), %rax; movq (((28 -16)&0xF)*8)(%rsp), %rbx; addq (((28 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((28 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((28)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0xC6E00BF33DA88FC2, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (((29 -15)&0xF)*8)(%rsp), %rax; movq (((29 -16)&0xF)*8)(%rsp), %rbx; addq (((29 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((29 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((29)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0xD5A79147930AA725, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (((30 -15)&0xF)*8)(%rsp), %rax; movq (((30 -16)&0xF)*8)(%rsp), %rbx; addq (((30 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((30 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq 
%rcx, %rax; addq %rax, %rbx; movq %rbx, (((30)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x06CA6351E003826F, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (((31 -15)&0xF)*8)(%rsp), %rax; movq (((31 -16)&0xF)*8)(%rsp), %rbx; addq (((31 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((31 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((31)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x142929670A0E6E70, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (((32 -15)&0xF)*8)(%rsp), %rax; movq (((32 -16)&0xF)*8)(%rsp), %rbx; addq (((32 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((32 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((32)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x27B70A8546D22FFC, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (((33 -15)&0xF)*8)(%rsp), %rax; movq (((33 -16)&0xF)*8)(%rsp), %rbx; addq (((33 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((33 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((33)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x2E1B21385C26C926, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; 
xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (((34 -15)&0xF)*8)(%rsp), %rax; movq (((34 -16)&0xF)*8)(%rsp), %rbx; addq (((34 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((34 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((34)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x4D2C6DFC5AC42AED, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (((35 -15)&0xF)*8)(%rsp), %rax; movq (((35 -16)&0xF)*8)(%rsp), %rbx; addq (((35 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((35 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((35)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x53380D139D95B3DF, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (((36 -15)&0xF)*8)(%rsp), %rax; movq (((36 -16)&0xF)*8)(%rsp), %rbx; addq (((36 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((36 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((36)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x650A73548BAF63DE, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (((37 -15)&0xF)*8)(%rsp), %rax; movq (((37 -16)&0xF)*8)(%rsp), %rbx; addq (((37 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((37 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, 
%rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((37)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x766A0ABB3C77B2A8, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (((38 -15)&0xF)*8)(%rsp), %rax; movq (((38 -16)&0xF)*8)(%rsp), %rbx; addq (((38 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((38 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((38)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x81C2C92E47EDAEE6, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (((39 -15)&0xF)*8)(%rsp), %rax; movq (((39 -16)&0xF)*8)(%rsp), %rbx; addq (((39 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((39 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((39)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x92722C851482353B, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (((40 -15)&0xF)*8)(%rsp), %rax; movq (((40 -16)&0xF)*8)(%rsp), %rbx; addq (((40 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((40 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((40)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0xA2BFE8A14CF10364, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq 
%r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (((41 -15)&0xF)*8)(%rsp), %rax; movq (((41 -16)&0xF)*8)(%rsp), %rbx; addq (((41 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((41 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((41)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0xA81A664BBC423001, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (((42 -15)&0xF)*8)(%rsp), %rax; movq (((42 -16)&0xF)*8)(%rsp), %rbx; addq (((42 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((42 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((42)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0xC24B8B70D0F89791, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (((43 -15)&0xF)*8)(%rsp), %rax; movq (((43 -16)&0xF)*8)(%rsp), %rbx; addq (((43 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((43 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((43)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0xC76C51A30654BE30, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (((44 -15)&0xF)*8)(%rsp), %rax; movq (((44 -16)&0xF)*8)(%rsp), %rbx; addq (((44 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq 
%rax, %rbx; movq (((44 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((44)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0xD192E819D6EF5218, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (((45 -15)&0xF)*8)(%rsp), %rax; movq (((45 -16)&0xF)*8)(%rsp), %rbx; addq (((45 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((45 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((45)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0xD69906245565A910, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (((46 -15)&0xF)*8)(%rsp), %rax; movq (((46 -16)&0xF)*8)(%rsp), %rbx; addq (((46 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((46 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((46)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0xF40E35855771202A, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (((47 -15)&0xF)*8)(%rsp), %rax; movq (((47 -16)&0xF)*8)(%rsp), %rbx; addq (((47 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((47 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((47)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x106AA07032BBD1B8, 
%rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (((48 -15)&0xF)*8)(%rsp), %rax; movq (((48 -16)&0xF)*8)(%rsp), %rbx; addq (((48 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((48 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((48)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x19A4C116B8D2D0C8, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (((49 -15)&0xF)*8)(%rsp), %rax; movq (((49 -16)&0xF)*8)(%rsp), %rbx; addq (((49 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((49 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((49)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x1E376C085141AB53, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (((50 -15)&0xF)*8)(%rsp), %rax; movq (((50 -16)&0xF)*8)(%rsp), %rbx; addq (((50 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((50 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((50)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x2748774CDF8EEB99, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (((51 -15)&0xF)*8)(%rsp), %rax; movq (((51 -16)&0xF)*8)(%rsp), %rbx; addq (((51 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, 
%rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((51 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((51)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x34B0BCB5E19B48A8, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (((52 -15)&0xF)*8)(%rsp), %rax; movq (((52 -16)&0xF)*8)(%rsp), %rbx; addq (((52 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((52 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((52)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x391C0CB3C5C95A63, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (((53 -15)&0xF)*8)(%rsp), %rax; movq (((53 -16)&0xF)*8)(%rsp), %rbx; addq (((53 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((53 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((53)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x4ED8AA4AE3418ACB, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (((54 -15)&0xF)*8)(%rsp), %rax; movq (((54 -16)&0xF)*8)(%rsp), %rbx; addq (((54 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((54 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((54)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; 
andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x5B9CCA4F7763E373, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (((55 -15)&0xF)*8)(%rsp), %rax; movq (((55 -16)&0xF)*8)(%rsp), %rbx; addq (((55 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((55 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((55)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x682E6FF3D6B2B8A3, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (((56 -15)&0xF)*8)(%rsp), %rax; movq (((56 -16)&0xF)*8)(%rsp), %rbx; addq (((56 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((56 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((56)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x748F82EE5DEFB2FC, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (((57 -15)&0xF)*8)(%rsp), %rax; movq (((57 -16)&0xF)*8)(%rsp), %rbx; addq (((57 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((57 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((57)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x78A5636F43172F60, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (((58 -15)&0xF)*8)(%rsp), %rax; movq (((58 -16)&0xF)*8)(%rsp), %rbx; addq 
(((58 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((58 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((58)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x84C87814A1F0AB72, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (((59 -15)&0xF)*8)(%rsp), %rax; movq (((59 -16)&0xF)*8)(%rsp), %rbx; addq (((59 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((59 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((59)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x8CC702081A6439EC, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (((60 -15)&0xF)*8)(%rsp), %rax; movq (((60 -16)&0xF)*8)(%rsp), %rbx; addq (((60 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((60 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((60)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x90BEFFFA23631E28, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (((61 -15)&0xF)*8)(%rsp), %rax; movq (((61 -16)&0xF)*8)(%rsp), %rbx; addq (((61 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((61 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((61)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq 
%rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0xA4506CEBDE82BDE9, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (((62 -15)&0xF)*8)(%rsp), %rax; movq (((62 -16)&0xF)*8)(%rsp), %rbx; addq (((62 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((62 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((62)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0xBEF9A3F7B2C67915, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (((63 -15)&0xF)*8)(%rsp), %rax; movq (((63 -16)&0xF)*8)(%rsp), %rbx; addq (((63 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((63 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((63)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0xC67178F2E372532B, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (((64 -15)&0xF)*8)(%rsp), %rax; movq (((64 -16)&0xF)*8)(%rsp), %rbx; addq (((64 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((64 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((64)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0xCA273ECEEA26619C, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + 
movq (((65 -15)&0xF)*8)(%rsp), %rax; movq (((65 -16)&0xF)*8)(%rsp), %rbx; addq (((65 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((65 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((65)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0xD186B8C721C0C207, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (((66 -15)&0xF)*8)(%rsp), %rax; movq (((66 -16)&0xF)*8)(%rsp), %rbx; addq (((66 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((66 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((66)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0xEADA7DD6CDE0EB1E, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (((67 -15)&0xF)*8)(%rsp), %rax; movq (((67 -16)&0xF)*8)(%rsp), %rbx; addq (((67 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((67 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((67)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0xF57D4F7FEE6ED178, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (((68 -15)&0xF)*8)(%rsp), %rax; movq (((68 -16)&0xF)*8)(%rsp), %rbx; addq (((68 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((68 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((68)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, 
%rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x06F067AA72176FBA, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (((69 -15)&0xF)*8)(%rsp), %rax; movq (((69 -16)&0xF)*8)(%rsp), %rbx; addq (((69 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((69 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((69)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x0A637DC5A2C898A6, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (((70 -15)&0xF)*8)(%rsp), %rax; movq (((70 -16)&0xF)*8)(%rsp), %rbx; addq (((70 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((70 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((70)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x113F9804BEF90DAE, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9; + movq (((71 -15)&0xF)*8)(%rsp), %rax; movq (((71 -16)&0xF)*8)(%rsp), %rbx; addq (((71 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((71 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((71)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x1B710B35131C471B, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq 
%r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8; + movq (((72 -15)&0xF)*8)(%rsp), %rax; movq (((72 -16)&0xF)*8)(%rsp), %rbx; addq (((72 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((72 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((72)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x28DB77F523047D84, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15; + movq (((73 -15)&0xF)*8)(%rsp), %rax; movq (((73 -16)&0xF)*8)(%rsp), %rbx; addq (((73 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((73 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((73)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x32CAAB7B40C72493, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14; + movq (((74 -15)&0xF)*8)(%rsp), %rax; movq (((74 -16)&0xF)*8)(%rsp), %rbx; addq (((74 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((74 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((74)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x3C9EBE0A15C9BEBC, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13; + movq (((75 -15)&0xF)*8)(%rsp), %rax; movq (((75 -16)&0xF)*8)(%rsp), %rbx; addq (((75 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((75 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; 
addq %rax, %rbx; movq %rbx, (((75)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x431D67C49C100D4C, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12; + movq (((76 -15)&0xF)*8)(%rsp), %rax; movq (((76 -16)&0xF)*8)(%rsp), %rbx; addq (((76 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((76 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((76)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x4CC5D4BECB3E42B6, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11; + movq (((77 -15)&0xF)*8)(%rsp), %rax; movq (((77 -16)&0xF)*8)(%rsp), %rbx; addq (((77 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((77 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((77)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x597F299CFC657E2A, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10; + movq (((78 -15)&0xF)*8)(%rsp), %rax; movq (((78 -16)&0xF)*8)(%rsp), %rbx; addq (((78 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((78 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((78)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x5FCB6FAB3AD6FAEC, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq 
%rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((79 -15)&0xF)*8)(%rsp), %rax; movq (((79 -16)&0xF)*8)(%rsp), %rbx; addq (((79 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((79 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((79)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x6C44198C4A475817, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+
+
+ addq %r8 , 0(%rdi)
+ addq %r9 , 8(%rdi)
+ addq %r10, 16(%rdi)
+ addq %r11, 24(%rdi)
+ addq %r12, 32(%rdi)
+ addq %r13, 40(%rdi)
+ addq %r14, 48(%rdi)
+ addq %r15, 56(%rdi)
+
+
+ movq %xmm0, %r10
+ movq %xmm1, %r11
+ movq %xmm2, %r12
+ movq %xmm3, %r13
+ movq %xmm4, %r14
+ movq %xmm5, %r15
+ movq %mm0, %rbx
+
+.ifdef WINABI
+ movq %mm1, %rdi
+ movq %mm2, %rsi
+.endif
+
+ emms
+
+ addq $128, %rsp
+
+ retq
+
+ .ifndef WINABI
+#if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+ .endif
diff --git a/src/Crypto/sha512-x86-nayuki.S b/src/Crypto/sha512-x86-nayuki.S
new file mode 100644
index 00000000..dcbebf7a
--- /dev/null
+++ b/src/Crypto/sha512-x86-nayuki.S
@@ -0,0 +1,180 @@
+#
+# SHA-512 hash in x86 assembly
+#
+# Copyright (c) 2014 Project Nayuki
+# http://www.nayuki.io/page/fast-sha2-hashes-in-x86-assembly
+#
+# (MIT License)
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+# - The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# - The Software is provided "as is", without warranty of any kind, express or
+# implied, including but not limited to the warranties of merchantability,
+# fitness for a particular purpose and noninfringement. In no event shall the
+# authors or copyright holders be liable for any claim, damages or other
+# liability, whether in an action of contract, tort or otherwise, arising from,
+# out of or in connection with the Software or the use or other dealings in the
+# Software.
+#
+
+# Modified by kerukuro for use in cppcrypto.
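+
+# sha512_compress_nayuki: 32-bit SHA-512 compression function. The first stack
+# argument (4(%ecx), after the caller's %esp is saved in %ecx) points to the
+# eight 64-bit hash state words, the second (8(%ecx)) to the 128-byte message
+# block. The state is copied to 0..56(%esp), the 16-entry message schedule is
+# kept at 64(%esp), and all 64-bit arithmetic is done in MMX registers; pshufb
+# against the .bswap64 mask (so SSSE3 is required in addition to MMX) byte-swaps
+# each big-endian input word. Every unrolled round below is the standard
+# SHA-512 step
+#     T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t],  T2 = Sigma0(a) + Maj(a,b,c)
+# with K[t] taken from the .roundconstants table defined elsewhere in this file.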
+ + .ifdef MS_STDCALL + .globl _sha512_compress_nayuki@8 + _sha512_compress_nayuki@8: + .else + .globl _sha512_compress_nayuki + .globl sha512_compress_nayuki + _sha512_compress_nayuki: + sha512_compress_nayuki: + .endif + + movl %esp, %ecx + subl $192, %esp + andl $~0xF, %esp + + + movl 4(%ecx), %eax + movdqu 0(%eax), %xmm0; movdqu %xmm0, 0(%esp) + movdqu 16(%eax), %xmm1; movdqu %xmm1, 16(%esp) + movdqu 32(%eax), %xmm2; movdqu %xmm2, 32(%esp) + movdqu 48(%eax), %xmm3; movdqu %xmm3, 48(%esp) + + + movl 8(%ecx), %eax + movq .bswap64, %mm7 + movq (0*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((0)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+0*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (1*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((1)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+1*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (2*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((2)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+2*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq 
$(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (3*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((3)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+3*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (4*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((4)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+4*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (5*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((5)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+5*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, 
%mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (6*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((6)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+6*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (7*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((7)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+7*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (8*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((8)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+8*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, 
%mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (9*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((9)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+9*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (10*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((10)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+10*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (11*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((11)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+11*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; 
movq %mm0, (4*8)(%esp); + movq (12*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((12)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+12*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (13*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((13)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+13*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (14*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((14)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+14*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (15*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((15)&0xF)*8+64)(%esp); paddq (0*8)(%esp), 
%mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+15*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (((16 -16)&0xF)*8+64)(%esp), %mm0; paddq (((16 - 7)&0xF)*8+64)(%esp), %mm0; movq (((16 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((16 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((16)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+16*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (((17 -16)&0xF)*8+64)(%esp), %mm0; paddq (((17 - 7)&0xF)*8+64)(%esp), %mm0; movq (((17 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((17 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((17)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, 
%mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+17*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (((18 -16)&0xF)*8+64)(%esp), %mm0; paddq (((18 - 7)&0xF)*8+64)(%esp), %mm0; movq (((18 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((18 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((18)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+18*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (((19 -16)&0xF)*8+64)(%esp), %mm0; paddq (((19 - 7)&0xF)*8+64)(%esp), %mm0; movq (((19 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((19 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((19)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, 
%mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+19*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (((20 -16)&0xF)*8+64)(%esp), %mm0; paddq (((20 - 7)&0xF)*8+64)(%esp), %mm0; movq (((20 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((20 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((20)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+20*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (((21 -16)&0xF)*8+64)(%esp), %mm0; paddq (((21 - 7)&0xF)*8+64)(%esp), %mm0; movq (((21 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((21 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((21)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), 
%mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+21*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (((22 -16)&0xF)*8+64)(%esp), %mm0; paddq (((22 - 7)&0xF)*8+64)(%esp), %mm0; movq (((22 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((22 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((22)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+22*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (((23 -16)&0xF)*8+64)(%esp), %mm0; paddq (((23 - 7)&0xF)*8+64)(%esp), %mm0; movq (((23 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((23 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((23)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; 
psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+23*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (((24 -16)&0xF)*8+64)(%esp), %mm0; paddq (((24 - 7)&0xF)*8+64)(%esp), %mm0; movq (((24 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((24 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((24)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+24*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (((25 -16)&0xF)*8+64)(%esp), %mm0; paddq (((25 - 7)&0xF)*8+64)(%esp), %mm0; movq (((25 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((25 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((25)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; 
pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+25*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (((26 -16)&0xF)*8+64)(%esp), %mm0; paddq (((26 - 7)&0xF)*8+64)(%esp), %mm0; movq (((26 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((26 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((26)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+26*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (((27 -16)&0xF)*8+64)(%esp), %mm0; paddq (((27 - 7)&0xF)*8+64)(%esp), %mm0; movq (((27 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((27 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((27)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq 
.roundconstants+27*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (((28 -16)&0xF)*8+64)(%esp), %mm0; paddq (((28 - 7)&0xF)*8+64)(%esp), %mm0; movq (((28 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((28 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((28)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+28*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (((29 -16)&0xF)*8+64)(%esp), %mm0; paddq (((29 - 7)&0xF)*8+64)(%esp), %mm0; movq (((29 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((29 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((29)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+29*8, %mm0; movq (1*8)(%esp), %mm2; pxor 
(0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (((30 -16)&0xF)*8+64)(%esp), %mm0; paddq (((30 - 7)&0xF)*8+64)(%esp), %mm0; movq (((30 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((30 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((30)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+30*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (((31 -16)&0xF)*8+64)(%esp), %mm0; paddq (((31 - 7)&0xF)*8+64)(%esp), %mm0; movq (((31 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((31 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((31)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+31*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor 
(7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (((32 -16)&0xF)*8+64)(%esp), %mm0; paddq (((32 - 7)&0xF)*8+64)(%esp), %mm0; movq (((32 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((32 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((32)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+32*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (((33 -16)&0xF)*8+64)(%esp), %mm0; paddq (((33 - 7)&0xF)*8+64)(%esp), %mm0; movq (((33 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((33 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((33)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+33*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; 
movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (((34 -16)&0xF)*8+64)(%esp), %mm0; paddq (((34 - 7)&0xF)*8+64)(%esp), %mm0; movq (((34 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((34 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((34)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+34*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (((35 -16)&0xF)*8+64)(%esp), %mm0; paddq (((35 - 7)&0xF)*8+64)(%esp), %mm0; movq (((35 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((35 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((35)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+35*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, 
(0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (((36 -16)&0xF)*8+64)(%esp), %mm0; paddq (((36 - 7)&0xF)*8+64)(%esp), %mm0; movq (((36 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((36 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((36)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+36*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (((37 -16)&0xF)*8+64)(%esp), %mm0; paddq (((37 - 7)&0xF)*8+64)(%esp), %mm0; movq (((37 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((37 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((37)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+37*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq 
%mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (((38 -16)&0xF)*8+64)(%esp), %mm0; paddq (((38 - 7)&0xF)*8+64)(%esp), %mm0; movq (((38 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((38 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((38)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+38*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (((39 -16)&0xF)*8+64)(%esp), %mm0; paddq (((39 - 7)&0xF)*8+64)(%esp), %mm0; movq (((39 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((39 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((39)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+39*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq 
$39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (((40 -16)&0xF)*8+64)(%esp), %mm0; paddq (((40 - 7)&0xF)*8+64)(%esp), %mm0; movq (((40 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((40 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((40)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+40*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (((41 -16)&0xF)*8+64)(%esp), %mm0; paddq (((41 - 7)&0xF)*8+64)(%esp), %mm0; movq (((41 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((41 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((41)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+41*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq 
$(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (((42 -16)&0xF)*8+64)(%esp), %mm0; paddq (((42 - 7)&0xF)*8+64)(%esp), %mm0; movq (((42 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((42 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((42)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+42*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (((43 -16)&0xF)*8+64)(%esp), %mm0; paddq (((43 - 7)&0xF)*8+64)(%esp), %mm0; movq (((43 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((43 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((43)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+43*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq 
%mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (((44 -16)&0xF)*8+64)(%esp), %mm0; paddq (((44 - 7)&0xF)*8+64)(%esp), %mm0; movq (((44 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((44 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((44)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+44*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (((45 -16)&0xF)*8+64)(%esp), %mm0; paddq (((45 - 7)&0xF)*8+64)(%esp), %mm0; movq (((45 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((45 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((45)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+45*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por 
%mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (((46 -16)&0xF)*8+64)(%esp), %mm0; paddq (((46 - 7)&0xF)*8+64)(%esp), %mm0; movq (((46 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((46 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((46)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+46*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (((47 -16)&0xF)*8+64)(%esp), %mm0; paddq (((47 - 7)&0xF)*8+64)(%esp), %mm0; movq (((47 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((47 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((47)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+47*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq 
(3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (((48 -16)&0xF)*8+64)(%esp), %mm0; paddq (((48 - 7)&0xF)*8+64)(%esp), %mm0; movq (((48 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((48 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((48)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+48*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (((49 -16)&0xF)*8+64)(%esp), %mm0; paddq (((49 - 7)&0xF)*8+64)(%esp), %mm0; movq (((49 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((49 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((49)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+49*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por 
(0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (((50 -16)&0xF)*8+64)(%esp), %mm0; paddq (((50 - 7)&0xF)*8+64)(%esp), %mm0; movq (((50 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((50 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((50)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+50*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (((51 -16)&0xF)*8+64)(%esp), %mm0; paddq (((51 - 7)&0xF)*8+64)(%esp), %mm0; movq (((51 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((51 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((51)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+51*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand 
(5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (((52 -16)&0xF)*8+64)(%esp), %mm0; paddq (((52 - 7)&0xF)*8+64)(%esp), %mm0; movq (((52 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((52 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((52)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+52*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (((53 -16)&0xF)*8+64)(%esp), %mm0; paddq (((53 - 7)&0xF)*8+64)(%esp), %mm0; movq (((53 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((53 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((53)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+53*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq 
%mm0, (2*8)(%esp); + movq (((54 -16)&0xF)*8+64)(%esp), %mm0; paddq (((54 - 7)&0xF)*8+64)(%esp), %mm0; movq (((54 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((54 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((54)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+54*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (((55 -16)&0xF)*8+64)(%esp), %mm0; paddq (((55 - 7)&0xF)*8+64)(%esp), %mm0; movq (((55 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((55 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((55)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+55*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (((56 -16)&0xF)*8+64)(%esp), 
%mm0; paddq (((56 - 7)&0xF)*8+64)(%esp), %mm0; movq (((56 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((56 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((56)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+56*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (((57 -16)&0xF)*8+64)(%esp), %mm0; paddq (((57 - 7)&0xF)*8+64)(%esp), %mm0; movq (((57 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((57 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((57)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+57*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (((58 -16)&0xF)*8+64)(%esp), %mm0; paddq (((58 - 7)&0xF)*8+64)(%esp), %mm0; movq (((58 
-15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((58 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((58)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+58*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (((59 -16)&0xF)*8+64)(%esp), %mm0; paddq (((59 - 7)&0xF)*8+64)(%esp), %mm0; movq (((59 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((59 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((59)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+59*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (((60 -16)&0xF)*8+64)(%esp), %mm0; paddq (((60 - 7)&0xF)*8+64)(%esp), %mm0; movq (((60 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, 
%mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((60 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((60)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+60*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (((61 -16)&0xF)*8+64)(%esp), %mm0; paddq (((61 - 7)&0xF)*8+64)(%esp), %mm0; movq (((61 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((61 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((61)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+61*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (((62 -16)&0xF)*8+64)(%esp), %mm0; paddq (((62 - 7)&0xF)*8+64)(%esp), %mm0; movq (((62 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, 
%mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((62 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((62)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+62*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (((63 -16)&0xF)*8+64)(%esp), %mm0; paddq (((63 - 7)&0xF)*8+64)(%esp), %mm0; movq (((63 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((63 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((63)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+63*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (((64 -16)&0xF)*8+64)(%esp), %mm0; paddq (((64 - 7)&0xF)*8+64)(%esp), %mm0; movq (((64 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), 
%mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((64 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((64)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+64*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (((65 -16)&0xF)*8+64)(%esp), %mm0; paddq (((65 - 7)&0xF)*8+64)(%esp), %mm0; movq (((65 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((65 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((65)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+65*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (((66 -16)&0xF)*8+64)(%esp), %mm0; paddq (((66 - 7)&0xF)*8+64)(%esp), %mm0; movq (((66 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor 
%mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((66 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((66)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+66*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (((67 -16)&0xF)*8+64)(%esp), %mm0; paddq (((67 - 7)&0xF)*8+64)(%esp), %mm0; movq (((67 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((67 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((67)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+67*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (((68 -16)&0xF)*8+64)(%esp), %mm0; paddq (((68 - 7)&0xF)*8+64)(%esp), %mm0; movq (((68 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((68 
- 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((68)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+68*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (((69 -16)&0xF)*8+64)(%esp), %mm0; paddq (((69 - 7)&0xF)*8+64)(%esp), %mm0; movq (((69 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((69 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((69)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+69*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (((70 -16)&0xF)*8+64)(%esp), %mm0; paddq (((70 - 7)&0xF)*8+64)(%esp), %mm0; movq (((70 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((70 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, 
%mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((70)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+70*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (((71 -16)&0xF)*8+64)(%esp), %mm0; paddq (((71 - 7)&0xF)*8+64)(%esp), %mm0; movq (((71 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((71 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((71)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+71*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + movq (((72 -16)&0xF)*8+64)(%esp), %mm0; paddq (((72 - 7)&0xF)*8+64)(%esp), %mm0; movq (((72 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((72 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, 
%mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((72)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+72*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp); + movq (((73 -16)&0xF)*8+64)(%esp), %mm0; paddq (((73 - 7)&0xF)*8+64)(%esp), %mm0; movq (((73 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((73 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((73)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+73*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp); + movq (((74 -16)&0xF)*8+64)(%esp), %mm0; paddq (((74 - 7)&0xF)*8+64)(%esp), %mm0; movq (((74 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((74 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), 
%mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((74)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+74*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp); + movq (((75 -16)&0xF)*8+64)(%esp), %mm0; paddq (((75 - 7)&0xF)*8+64)(%esp), %mm0; movq (((75 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((75 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((75)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+75*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp); + movq (((76 -16)&0xF)*8+64)(%esp), %mm0; paddq (((76 - 7)&0xF)*8+64)(%esp), %mm0; movq (((76 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((76 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; 
pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((76)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+76*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp); + movq (((77 -16)&0xF)*8+64)(%esp), %mm0; paddq (((77 - 7)&0xF)*8+64)(%esp), %mm0; movq (((77 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((77 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((77)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+77*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp); + movq (((78 -16)&0xF)*8+64)(%esp), %mm0; paddq (((78 - 7)&0xF)*8+64)(%esp), %mm0; movq (((78 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((78 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq 
%mm0, (((78)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+78*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp); + movq (((79 -16)&0xF)*8+64)(%esp), %mm0; paddq (((79 - 7)&0xF)*8+64)(%esp), %mm0; movq (((79 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((79 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((79)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+79*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp); + + + movl 4(%ecx), %eax + movdqu 0(%eax), %xmm0; paddq 0(%esp), %xmm0; movdqu %xmm0, 0(%eax) + movdqu 16(%eax), %xmm1; paddq 16(%esp), %xmm1; movdqu %xmm1, 16(%eax) + movdqu 32(%eax), %xmm2; paddq 32(%esp), %xmm2; movdqu %xmm2, 32(%eax) + movdqu 48(%eax), %xmm3; paddq 48(%esp), %xmm3; movdqu %xmm3, 48(%eax) + + + emms + movl %ecx, %esp + + .ifdef MS_STDCALL + ret $8 + .else + retl + .endif + + +.balign 8 +.bswap64: + .quad 0x0001020304050607 + +.roundconstants: + .quad 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC + .quad 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118 + .quad 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 
0x550C7DC3D5FFB4E2 + .quad 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694 + .quad 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65 + .quad 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5 + .quad 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4 + .quad 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70 + .quad 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF + .quad 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B + .quad 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30 + .quad 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8 + .quad 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8 + .quad 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3 + .quad 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC + .quad 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B + .quad 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178 + .quad 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B + .quad 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C + .quad 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 + + .ifndef WINABI +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + .endif \ No newline at end of file diff --git a/src/Crypto/sha512_avx1_x64.asm b/src/Crypto/sha512_avx1_x64.asm new file mode 100644 index 00000000..06321b5b --- /dev/null +++ b/src/Crypto/sha512_avx1_x64.asm @@ -0,0 +1,427 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -f x64 -D WINABI sha512_avx.asm +; Linux: yasm -f elf64 sha512_avx.asm +; + +BITS 64 +section .text + +; Virtual Registers +%ifdef WINABI + %define msg rcx ; ARG1 + %define digest rdx ; ARG2 + %define msglen r8 ; ARG3 + %define T1 rsi + %define T2 rdi +%else + %define msg rdi ; ARG1 + %define digest rsi ; ARG2 + %define msglen rdx ; ARG3 + %define T1 rcx + %define T2 r8 +%endif +%define a_64 r9 +%define b_64 r10 +%define c_64 r11 +%define d_64 r12 +%define e_64 r13 +%define f_64 r14 +%define g_64 r15 +%define h_64 rbx +%define tmp0 rax + +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +struc frame + .W: resq 80 ; Message Schedule + .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1] + +%ifdef WINABI + .XMMSAVE: resdq 4 + .GPRSAVE: resq 7 +%else + .GPRSAVE: resq 5 +%endif +endstruc + +; Useful QWORD "arrays" for simpler memory references +%define MSG(i) msg + 8*(i) ; Input message (arg1) +%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2) +%define K_t(i) K512 + 8*(i) wrt rip ; SHA Constants (static mem) +%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame) +%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame) +; MSG, DIGEST, K_t, W_t are arrays +; WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even + +%macro RotateState 0 + ; Rotate symbles a..h right + %xdefine %%TMP h_64 + %xdefine h_64 g_64 + %xdefine g_64 f_64 + %xdefine f_64 e_64 + %xdefine e_64 d_64 + %xdefine d_64 c_64 + %xdefine c_64 b_64 + %xdefine b_64 a_64 + %xdefine a_64 %%TMP +%endmacro + +%macro RORQ 2 + ; shld is faster than ror on Sandybridge + shld %1, %1, (64 - %2) +%endmacro + +%macro SHA512_Round 1 +%assign %%t (%1) + + ; Compute Round %%t + mov T1, f_64 ; T1 = f + mov tmp0, e_64 ; tmp = e + xor T1, g_64 ; T1 = f ^ g + RORQ tmp0, 23 ; 41 ; tmp = e ror 23 + and T1, e_64 ; T1 = (f ^ g) & e + xor tmp0, e_64 ; tmp = (e ror 23) ^ e + xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler + RORQ tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4 + xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e + mov T2, a_64 ; T2 = a + add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h + RORQ tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov tmp0, a_64 ; tmp = a + xor T2, c_64 ; T2 = a ^ c + and tmp0, c_64 ; tmp = a & c + and T2, b_64 ; T2 = (a ^ c) & b + xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov tmp0, a_64 ; tmp = a + RORQ tmp0, 5 ; 39 ; tmp = a ror 5 + xor tmp0, a_64 ; tmp = (a ror 5) ^ a + add d_64, T1 ; e(next_state) = d + T1 + RORQ tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6 + xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a + lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c) + RORQ tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +%endmacro + +%macro SHA512_2Sched_2Round_avx 1 +%assign %%t %1 + ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. 
+ ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brievity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. Vectored instructions (for the message scheduler) are indented + ; by two tabs. + + vmovdqa xmm4, [W_t(%%t-2)] ; XMM4 = W[t-2] + vmovdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15] + mov T1, f_64 + vpsrlq xmm0, xmm4, 61 ; XMM0 = W[t-2]>>61 + mov tmp0, e_64 + vpsrlq xmm6, xmm5, 1 ; XMM6 = W[t-15]>>1 + xor T1, g_64 + RORQ tmp0, 23 ; 41 + vpsrlq xmm1, xmm4, 19 ; XMM1 = W[t-2]>>19 + and T1, e_64 + xor tmp0, e_64 + vpxor xmm0, xmm1 ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 + xor T1, g_64 + add T1, [WK_2(%%t)]; + vpsrlq xmm7, xmm5, 8 ; XMM7 = W[t-15]>>8 + RORQ tmp0, 4 ; 18 + vpsrlq xmm2, xmm4, 6 ; XMM2 = W[t-2]>>6 + xor tmp0, e_64 + mov T2, a_64 + add T1, h_64 + vpxor xmm6, xmm7 ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 + RORQ tmp0, 14 ; 14 + add T1, tmp0 + vpsrlq xmm8, xmm5, 7 ; XMM8 = W[t-15]>>7 + mov tmp0, a_64 + xor T2, c_64 + vpsllq xmm3, xmm4, (64-61) ; XMM3 = W[t-2]<<3 + and tmp0, c_64 + and T2, b_64 + vpxor xmm2, xmm3 ; XMM2 = W[t-2]>>6 ^ W[t-2]<<3 + xor T2, tmp0 + mov tmp0, a_64 + vpsllq xmm9, xmm5, (64-1) ; XMM9 = W[t-15]<<63 + RORQ tmp0, 5 ; 39 + vpxor xmm8, xmm9 ; XMM8 = W[t-15]>>7 ^ W[t-15]<<63 + xor tmp0, a_64 + add d_64, T1 + RORQ tmp0, 6 ; 34 + xor tmp0, a_64 + vpxor xmm6, xmm8 ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 + lea h_64, [T1 + T2] + RORQ tmp0, 28 ; 28 + vpsllq xmm4, (64-19) ; XMM4 = W[t-2]<<25 + add h_64, tmp0 + RotateState + vpxor xmm0, xmm4 ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 + mov T1, f_64 + vpxor xmm0, xmm2 ; XMM0 = s1(W[t-2]) + mov tmp0, e_64 + xor T1, g_64 + vpaddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + W[t-16] + vmovdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7] + RORQ tmp0, 23 ; 41 + and T1, e_64 + xor tmp0, e_64 + xor T1, g_64 + vpsllq xmm5, (64-8) ; XMM5 = W[t-15]<<56 + add T1, [WK_2(%%t+1)] + vpxor xmm6, xmm5 ; XMM6 = s0(W[t-15]) + RORQ tmp0, 4 ; 18 + vpaddq xmm0, xmm6 ; XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) + xor tmp0, e_64 + vpaddq xmm0, xmm1 ; XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + mov T2, a_64 + add T1, h_64 + RORQ tmp0, 14 ; 14 + add T1, tmp0 + vmovdqa [W_t(%%t)], xmm0 ; Store W[t] + vpaddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + vmovdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds + mov tmp0, a_64 + xor T2, c_64 + and tmp0, c_64 + and T2, b_64 + xor T2, tmp0 + mov tmp0, a_64 + RORQ tmp0, 5 ; 39 + xor tmp0, a_64 + add d_64, T1 + RORQ tmp0, 6 ; 34 + xor tmp0, a_64 + lea h_64, [T1 + T2] + RORQ tmp0, 28 ; 28 + add h_64, tmp0 + RotateState +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_avx(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. 
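+; (One SHA-512 message block is 128 bytes, i.e. 1024 bits; this routine only
+; consumes whole blocks and performs no padding or finalization itself, which
+; is presumably handled by the portable C wrapper that calls it.)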
+; L is the message length in SHA512 blocks +global sha512_avx:function +sha512_avx: + cmp msglen, 0 + je .nowork + + ; Allocate Stack Space + sub rsp, frame_size + + ; Save GPRs + mov [rsp + frame.GPRSAVE + 8 * 0], rbx + mov [rsp + frame.GPRSAVE + 8 * 1], r12 + mov [rsp + frame.GPRSAVE + 8 * 2], r13 + mov [rsp + frame.GPRSAVE + 8 * 3], r14 + mov [rsp + frame.GPRSAVE + 8 * 4], r15 +%ifdef WINABI + mov [rsp + frame.GPRSAVE + 8 * 5], rsi + mov [rsp + frame.GPRSAVE + 8 * 6], rdi +%endif + ; Save XMMs +%ifdef WINABI + vmovdqa [rsp + frame.XMMSAVE + 16 * 0], xmm6 + vmovdqa [rsp + frame.XMMSAVE + 16 * 1], xmm7 + vmovdqa [rsp + frame.XMMSAVE + 16 * 2], xmm8 + vmovdqa [rsp + frame.XMMSAVE + 16 * 3], xmm9 +%endif + +.updateblock: + + ; Load state variables + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + %assign t 0 + %rep 80/2 + 1 + ; (80 rounds) / (2 rounds/iteration) + (1 iteration) + ; +1 iteration because the scheduler leads hashing by 1 iteration + %if t < 2 + ; BSWAP 2 QWORDS + vmovdqa xmm1, [XMM_QWORD_BSWAP wrt rip] + vmovdqu xmm0, [MSG(t)] + vpshufb xmm0, xmm0, xmm1 ; BSWAP + vmovdqa [W_t(t)], xmm0 ; Store Scheduled Pair + vpaddq xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t] + vmovdqa [WK_2(t)], xmm0 ; Store into WK for rounds + %elif t < 16 + ; BSWAP 2 QWORDS, Compute 2 Rounds + vmovdqu xmm0, [MSG(t)] + vpshufb xmm0, xmm0, xmm1 ; BSWAP + SHA512_Round t - 2 ; Round t-2 + vmovdqa [W_t(t)], xmm0 ; Store Scheduled Pair + vpaddq xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t] + SHA512_Round t - 1 ; Round t-1 + vmovdqa [WK_2(t)], xmm0 ; W[t]+K[t] into WK + %elif t < 79 + ; Schedule 2 QWORDS; Compute 2 Rounds + SHA512_2Sched_2Round_avx t + %else + ; Compute 2 Rounds + SHA512_Round t - 2 + SHA512_Round t - 1 + %endif + %assign t t+2 + %endrep + + ; Update digest + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + ; Advance to next message block + add msg, 16*8 + dec msglen + jnz .updateblock + + ; Restore XMMs +%ifdef WINABI + vmovdqa xmm6, [rsp + frame.XMMSAVE + 16 * 0] + vmovdqa xmm7, [rsp + frame.XMMSAVE + 16 * 1] + vmovdqa xmm8, [rsp + frame.XMMSAVE + 16 * 2] + vmovdqa xmm9, [rsp + frame.XMMSAVE + 16 * 3] +%endif + ; Restore GPRs + mov rbx, [rsp + frame.GPRSAVE + 8 * 0] + mov r12, [rsp + frame.GPRSAVE + 8 * 1] + mov r13, [rsp + frame.GPRSAVE + 8 * 2] + mov r14, [rsp + frame.GPRSAVE + 8 * 3] + mov r15, [rsp + frame.GPRSAVE + 8 * 4] +%ifdef WINABI + mov rsi, [rsp + frame.GPRSAVE + 8 * 5] + mov rdi, [rsp + frame.GPRSAVE + 8 * 6] +%endif + ; Restore Stack Pointer + add rsp, frame_size + +.nowork: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data + +section .data + +ALIGN 16 + +; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 
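+; (Used as a pshufb control, this constant maps destination byte i of each
+; qword to source byte 7-i, so the two qwords are byte-swapped independently;
+; this converts SHA-512's big-endian 64-bit message words into the
+; little-endian order the integer rounds operate on.)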
+XMM_QWORD_BSWAP: + ddq 0x08090a0b0c0d0e0f0001020304050607 + +; K[t] used in SHA512 hashing +K512: + dq 0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha512_avx1_x86.asm b/src/Crypto/sha512_avx1_x86.asm new file mode 100644 index 00000000..31c8bd0d --- /dev/null +++ b/src/Crypto/sha512_avx1_x86.asm @@ -0,0 +1,10 @@ + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha512_avx2_x64.asm b/src/Crypto/sha512_avx2_x64.asm new file mode 100644 index 00000000..1ba08665 --- /dev/null +++ b/src/Crypto/sha512_avx2_x64.asm @@ -0,0 +1,804 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. 
+; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -f x64 -D WINABI sha512_rorx.asm +; Linux: yasm -f elf64 sha512_rorx.asm +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +BITS 64 +section .text + +; Virtual Registers +%define Y_0 ymm4 +%define Y_1 ymm5 +%define Y_2 ymm6 +%define Y_3 ymm7 + +%define YTMP0 ymm0 +%define YTMP1 ymm1 +%define YTMP2 ymm2 +%define YTMP3 ymm3 +%define YTMP4 ymm8 +%define XFER YTMP0 + +%define BYTE_FLIP_MASK ymm9 + +%ifdef WINABI + %define INP rcx ; 1st arg + %define CTX rdx ; 2nd arg + %define NUM_BLKS r8 ; 3rd arg + %define c rdi + %define d rsi + %define e r8 + %define y3 rcx +%else + %define INP rdi ; 1st arg + %define CTX rsi ; 2nd arg + %define NUM_BLKS rdx ; 3rd arg + %define c rcx + %define d r8 + %define e rdx + %define y3 rdi +%endif + +%define TBL rbp + +%define a rax +%define b rbx + +%define f r9 +%define g r10 +%define h r11 +%define old_h r11 + +%define T1 r12 +%define y0 r13 +%define y1 r14 +%define y2 r15 + +%define y4 r12 + +; Local variables (stack frame) +struc frame + .XFER: resq 4 + .SRND: resq 1 + .INP: resq 1 + .INPEND: resq 1 + .RSPSAVE: resq 1 + +%ifdef WINABI + .XMMSAVE: resdq 4 + .GPRSAVE: resq 8 +%else + .GPRSAVE: resq 6 +%endif +endstruc + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +; addm [mem], reg +; Add reg to mem using reg-mem add and store +%macro addm 2 + add %2, %1 + mov %1, %2 +%endm + + +; COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask +; Load ymm with mem and byte swap each dword +%macro COPY_YMM_AND_BSWAP 3 + VMOVDQ %1, %2 + vpshufb %1, %1 ,%3 +%endmacro +; rotate_Ys +; Rotate values of symbols Y0...Y3 +%macro rotate_Ys 0 + %xdefine %%Y_ Y_0 + %xdefine Y_0 Y_1 + %xdefine Y_1 Y_2 + %xdefine Y_2 Y_3 + %xdefine Y_3 %%Y_ +%endm + +; RotateState +%macro RotateState 0 + ; Rotate symbles a..h right + %xdefine old_h h + %xdefine %%TMP_ h + %xdefine h g + %xdefine g f + %xdefine f e + %xdefine e d + %xdefine d c + %xdefine c b + %xdefine b a + %xdefine a %%TMP_ +%endm + +; %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL +; YDST = {YSRC1, YSRC2} >> RVAL*8 +%macro MY_VPALIGNR 4 +%define %%YDST %1 +%define %%YSRC1 %2 +%define %%YSRC2 %3 +%define %%RVAL %4 + vperm2f128 %%YDST, %%YSRC1, %%YSRC2, 0x3 ; YDST = {YS1_LO, YS2_HI} + vpalignr %%YDST, %%YDST, %%YSRC2, %%RVAL ; YDST = {YDS1, YS2} >> RVAL*8 +%endm + +%macro 
FOUR_ROUNDS_AND_SCHED 0 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ; Extract w[t-7] + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 ; YTMP0 = W[-7] + ; Calculate w[t-16] + w[t-7] + vpaddq YTMP0, YTMP0, Y_0 ; YTMP0 = W[-7] + W[-16] + ; Extract w[t-15] + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 ; YTMP1 = W[-15] + + ; Calculate sigma0 + + ; Calculate w[t-15] ror 1 + vpsrlq YTMP2, YTMP1, 1 + vpsllq YTMP3, YTMP1, (64-1) + vpor YTMP3, YTMP3, YTMP2 ; YTMP3 = W[-15] ror 1 + ; Calculate w[t-15] shr 7 + vpsrlq YTMP4, YTMP1, 7 ; YTMP4 = W[-15] >> 7 + + mov y3, a ; y3 = a ; MAJA + rorx y0, e, 41 ; y0 = e >> 41 ; S1A + rorx y1, e, 18 ; y1 = e >> 18 ; S1B + + add h, [rsp+frame.XFER+0*8] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + mov y2, f ; y2 = f ; CH + rorx T1, a, 34 ; T1 = a >> 34 ; S0B + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1 + xor y2, g ; y2 = f^g ; CH + rorx y1, e, 14 ; y1 = (e >> 14) ; S1 + + and y2, e ; y2 = (f^g)&e ; CH + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 + rorx y1, a, 39 ; y1 = a >> 39 ; S0A + add d, h ; d = k + w + h + d ; -- + + and y3, b ; y3 = (a|c)&b ; MAJA + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0 + rorx T1, a, 28 ; T1 = (a >> 28) ; S0 + + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 + mov T1, a ; T1 = a ; MAJB + and T1, c ; T1 = a&c ; MAJB + + add y2, y0 ; y2 = S1 + CH ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + add h, y3 ; h = t1 + S0 + MAJ ; -- + +RotateState + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;; + + ; Calculate w[t-15] ror 8 + vpsrlq YTMP2, YTMP1, 8 + vpsllq YTMP1, YTMP1, (64-8) + vpor YTMP1, YTMP1, YTMP2 ; YTMP1 = W[-15] ror 8 + ; XOR the three components + vpxor YTMP3, YTMP3, YTMP4 ; YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 + vpxor YTMP1, YTMP3, YTMP1 ; YTMP1 = s0 + + + ; Add three components, w[t-16], w[t-7] and sigma0 + vpaddq YTMP0, YTMP0, YTMP1 ; YTMP0 = W[-16] + W[-7] + s0 + ; Move to appropriate lanes for calculating w[16] and w[17] + vperm2f128 Y_0, YTMP0, YTMP0, 0x0 ; Y_0 = W[-16] + W[-7] + s0 {BABA} + ; Move to appropriate lanes for calculating w[18] and w[19] + vpand YTMP0, YTMP0, [MASK_YMM_LO wrt rip] ; YTMP0 = W[-16] + W[-7] + s0 {DC00} + + ; Calculate w[16] and w[17] in both 128 bit lanes + + ; Calculate sigma1 for w[16] and w[17] on both 128 bit lanes + vperm2f128 YTMP2, Y_3, Y_3, 0x11 ; YTMP2 = W[-2] {BABA} + vpsrlq YTMP4, YTMP2, 6 ; YTMP4 = W[-2] >> 6 {BABA} + + + mov y3, a ; y3 = a ; MAJA + rorx y0, e, 41 ; y0 = e >> 41 ; S1A + rorx y1, e, 18 ; y1 = e >> 18 ; S1B + add h, [rsp+frame.XFER+1*8] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + + mov y2, f ; y2 = f ; CH + rorx T1, a, 34 ; T1 = a >> 34 ; S0B + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1 + xor y2, g ; y2 = f^g ; CH + + + rorx y1, e, 14 ; y1 = (e >> 14) ; S1 + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 + rorx y1, a, 39 ; y1 = a >> 39 ; S0A + and y2, e ; y2 = (f^g)&e ; CH + add d, h ; d = k + w + h + d ; -- + + and y3, b ; y3 = (a|c)&b ; MAJA + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0 + + rorx T1, a, 28 ; T1 = (a >> 28) ; S0 + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 + mov T1, a ; T1 = a ; MAJB + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + 
add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + add h, y3 ; h = t1 + S0 + MAJ ; -- + +RotateState + + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;; + + + vpsrlq YTMP3, YTMP2, 19 ; YTMP3 = W[-2] >> 19 {BABA} + vpsllq YTMP1, YTMP2, (64-19) ; YTMP1 = W[-2] << 19 {BABA} + vpor YTMP3, YTMP3, YTMP1 ; YTMP3 = W[-2] ror 19 {BABA} + vpxor YTMP4, YTMP4, YTMP3 ; YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} + vpsrlq YTMP3, YTMP2, 61 ; YTMP3 = W[-2] >> 61 {BABA} + vpsllq YTMP1, YTMP2, (64-61) ; YTMP1 = W[-2] << 61 {BABA} + vpor YTMP3, YTMP3, YTMP1 ; YTMP3 = W[-2] ror 61 {BABA} + vpxor YTMP4, YTMP4, YTMP3 ; YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} + + ; Add sigma1 to the other compunents to get w[16] and w[17] + vpaddq Y_0, Y_0, YTMP4 ; Y_0 = {W[1], W[0], W[1], W[0]} + + ; Calculate sigma1 for w[18] and w[19] for upper 128 bit lane + vpsrlq YTMP4, Y_0, 6 ; YTMP4 = W[-2] >> 6 {DC--} + + mov y3, a ; y3 = a ; MAJA + rorx y0, e, 41 ; y0 = e >> 41 ; S1A + add h, [rsp+frame.XFER+2*8] ; h = k + w + h ; -- + + rorx y1, e, 18 ; y1 = e >> 18 ; S1B + or y3, c ; y3 = a|c ; MAJA + mov y2, f ; y2 = f ; CH + xor y2, g ; y2 = f^g ; CH + + rorx T1, a, 34 ; T1 = a >> 34 ; S0B + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + + rorx y1, e, 14 ; y1 = (e >> 14) ; S1 + add d, h ; d = k + w + h + d ; -- + and y3, b ; y3 = (a|c)&b ; MAJA + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 + rorx y1, a, 39 ; y1 = a >> 39 ; S0A + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0 + rorx T1, a, 28 ; T1 = (a >> 28) ; S0 + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 + mov T1, a ; T1 = a ; MAJB + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + + add h, y3 ; h = t1 + S0 + MAJ ; -- + +RotateState + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;; + + vpsrlq YTMP3, Y_0, 19 ; YTMP3 = W[-2] >> 19 {DC--} + vpsllq YTMP1, Y_0, (64-19) ; YTMP1 = W[-2] << 19 {DC--} + vpor YTMP3, YTMP3, YTMP1 ; YTMP3 = W[-2] ror 19 {DC--} + vpxor YTMP4, YTMP4, YTMP3 ; YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} + vpsrlq YTMP3, Y_0, 61 ; YTMP3 = W[-2] >> 61 {DC--} + vpsllq YTMP1, Y_0, (64-61) ; YTMP1 = W[-2] << 61 {DC--} + vpor YTMP3, YTMP3, YTMP1 ; YTMP3 = W[-2] ror 61 {DC--} + vpxor YTMP4, YTMP4, YTMP3 ; YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} + + ; Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] + vpaddq YTMP2, YTMP0, YTMP4 ; YTMP2 = {W[3], W[2], --, --} + + ; Form w[19, w[18], w17], w[16] + vpblendd Y_0, Y_0, YTMP2, 0xF0 ; Y_0 = {W[3], W[2], W[1], W[0]} +; vperm2f128 Y_0, Y_0, YTMP2, 0x30 + + mov y3, a ; y3 = a ; MAJA + rorx y0, e, 41 ; y0 = e >> 41 ; S1A + rorx y1, e, 18 ; y1 = e >> 18 ; S1B + add h, [rsp+frame.XFER+3*8] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + + mov y2, f ; y2 = f ; CH + rorx T1, a, 34 ; T1 = a >> 34 ; S0B + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1 + xor y2, g ; y2 = f^g ; CH + + + rorx y1, e, 14 ; y1 = (e >> 14) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + add d, h ; d = k + w + h + d ; -- + and y3, b 
; y3 = (a|c)&b ; MAJA + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + + rorx y1, a, 39 ; y1 = a >> 39 ; S0A + add y2, y0 ; y2 = S1 + CH ; -- + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0 + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + rorx T1, a, 28 ; T1 = (a >> 28) ; S0 + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 + mov T1, a ; T1 = a ; MAJB + and T1, c ; T1 = a&c ; MAJB + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + + add h, y1 ; h = k + w + h + S0 ; -- + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + add h, y3 ; h = t1 + S0 + MAJ ; -- + +RotateState + +rotate_Ys +%endm + +%macro DO_4ROUNDS 0 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov y2, f ; y2 = f ; CH + rorx y0, e, 41 ; y0 = e >> 41 ; S1A + rorx y1, e, 18 ; y1 = e >> 18 ; S1B + xor y2, g ; y2 = f^g ; CH + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1 + rorx y1, e, 14 ; y1 = (e >> 14) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 + rorx T1, a, 34 ; T1 = a >> 34 ; S0B + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + rorx y1, a, 39 ; y1 = a >> 39 ; S0A + mov y3, a ; y3 = a ; MAJA + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0 + rorx T1, a, 28 ; T1 = (a >> 28) ; S0 + add h, [rsp + frame.XFER + 8*0] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 + mov T1, a ; T1 = a ; MAJB + and y3, b ; y3 = (a|c)&b ; MAJA + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + + add d, h ; d = k + w + h + d ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + + ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + + ;add h, y3 ; h = t1 + S0 + MAJ ; -- + + RotateState + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + mov y2, f ; y2 = f ; CH + rorx y0, e, 41 ; y0 = e >> 41 ; S1A + rorx y1, e, 18 ; y1 = e >> 18 ; S1B + xor y2, g ; y2 = f^g ; CH + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1 + rorx y1, e, 14 ; y1 = (e >> 14) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + add old_h, y3 ; h = t1 + S0 + MAJ ; -- + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 + rorx T1, a, 34 ; T1 = a >> 34 ; S0B + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + rorx y1, a, 39 ; y1 = a >> 39 ; S0A + mov y3, a ; y3 = a ; MAJA + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0 + rorx T1, a, 28 ; T1 = (a >> 28) ; S0 + add h, [rsp + frame.XFER + 8*1] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 + mov T1, a ; T1 = a ; MAJB + and y3, b ; y3 = (a|c)&b ; MAJA + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + + add d, h ; d = k + w + h + d ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + + ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + + ;add h, y3 ; h = t1 + S0 + MAJ ; -- + + RotateState + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + mov y2, f ; y2 = f ; CH + rorx y0, e, 41 ; y0 = e >> 41 ; S1A + rorx y1, e, 18 ; y1 = e >> 18 ; S1B + xor y2, g ; y2 = f^g ; CH + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1 + rorx y1, e, 14 ; y1 = (e >> 14) ; S1 + and y2, e ; y2 = 
(f^g)&e ; CH + add old_h, y3 ; h = t1 + S0 + MAJ ; -- + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 + rorx T1, a, 34 ; T1 = a >> 34 ; S0B + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + rorx y1, a, 39 ; y1 = a >> 39 ; S0A + mov y3, a ; y3 = a ; MAJA + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0 + rorx T1, a, 28 ; T1 = (a >> 28) ; S0 + add h, [rsp + frame.XFER + 8*2] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 + mov T1, a ; T1 = a ; MAJB + and y3, b ; y3 = (a|c)&b ; MAJA + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + + add d, h ; d = k + w + h + d ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + + ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + + ;add h, y3 ; h = t1 + S0 + MAJ ; -- + + RotateState + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + mov y2, f ; y2 = f ; CH + rorx y0, e, 41 ; y0 = e >> 41 ; S1A + rorx y1, e, 18 ; y1 = e >> 18 ; S1B + xor y2, g ; y2 = f^g ; CH + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1 + rorx y1, e, 14 ; y1 = (e >> 14) ; S1 + and y2, e ; y2 = (f^g)&e ; CH + add old_h, y3 ; h = t1 + S0 + MAJ ; -- + + xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 + rorx T1, a, 34 ; T1 = a >> 34 ; S0B + xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH + rorx y1, a, 39 ; y1 = a >> 39 ; S0A + mov y3, a ; y3 = a ; MAJA + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0 + rorx T1, a, 28 ; T1 = (a >> 28) ; S0 + add h, [rsp + frame.XFER + 8*3] ; h = k + w + h ; -- + or y3, c ; y3 = a|c ; MAJA + + xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 + mov T1, a ; T1 = a ; MAJB + and y3, b ; y3 = (a|c)&b ; MAJA + and T1, c ; T1 = a&c ; MAJB + add y2, y0 ; y2 = S1 + CH ; -- + + + add d, h ; d = k + w + h + d ; -- + or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ + add h, y1 ; h = k + w + h + S0 ; -- + + add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; -- + + + add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- + + add h, y3 ; h = t1 + S0 + MAJ ; -- + + RotateState + +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_rorx(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. 
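+; (This variant additionally requires AVX2 (ymm registers, vpblendd) and the
+; BMI2 rorx instruction; it is presumably selected at runtime only when the
+; CPU reports both features, with the actual check living in the calling C
+; code rather than in this file.)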
+; L is the message length in SHA512 blocks +global sha512_rorx:function +global _sha512_rorx:function +sha512_rorx: +_sha512_rorx: + + ; Allocate Stack Space + mov rax, rsp + sub rsp, frame_size + and rsp, ~(0x20 - 1) + mov [rsp + frame.RSPSAVE], rax + + ; Save GPRs + mov [rsp + frame.GPRSAVE + 8 * 0], rbp + mov [rsp + frame.GPRSAVE + 8 * 1], rbx + mov [rsp + frame.GPRSAVE + 8 * 2], r12 + mov [rsp + frame.GPRSAVE + 8 * 3], r13 + mov [rsp + frame.GPRSAVE + 8 * 4], r14 + mov [rsp + frame.GPRSAVE + 8 * 5], r15 +%ifdef WINABI + mov [rsp + frame.GPRSAVE + 8 * 6], rsi + mov [rsp + frame.GPRSAVE + 8 * 7], rdi +%endif + +%ifdef WINABI + vmovdqa [rsp + frame.XMMSAVE + 0*16], xmm6 + vmovdqa [rsp + frame.XMMSAVE + 1*16], xmm7 + vmovdqa [rsp + frame.XMMSAVE + 2*16], xmm8 + vmovdqa [rsp + frame.XMMSAVE + 3*16], xmm9 +%endif + + vpblendd xmm0, xmm0, xmm1, 0xf0 + vpblendd ymm0, ymm0, ymm1, 0xf0 + + shl NUM_BLKS, 7 ; convert to bytes + jz done_hash + add NUM_BLKS, INP ; pointer to end of data + mov [rsp + frame.INPEND], NUM_BLKS + + ;; load initial digest + mov a,[8*0 + CTX] + mov b,[8*1 + CTX] + mov c,[8*2 + CTX] + mov d,[8*3 + CTX] + mov e,[8*4 + CTX] + mov f,[8*5 + CTX] + mov g,[8*6 + CTX] + mov h,[8*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip] + +loop0: + lea TBL,[K512 wrt rip] + + ;; byte swap first 16 dwords + COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + + mov [rsp + frame.INP], INP + + ;; schedule 64 input dwords, by doing 12 rounds of 4 each + mov qword[rsp + frame.SRND],4 + +align 16 +loop1: + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame.XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 1*32] + vmovdqa [rsp + frame.XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 2*32] + vmovdqa [rsp + frame.XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 3*32] + vmovdqa [rsp + frame.XFER], XFER + add TBL, 4*32 + FOUR_ROUNDS_AND_SCHED + + sub qword[rsp + frame.SRND], 1 + jne loop1 + + mov qword[rsp + frame.SRND], 2 +loop2: + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame.XFER], XFER + DO_4ROUNDS + vpaddq XFER, Y_1, [TBL + 1*32] + vmovdqa [rsp + frame.XFER], XFER + add TBL, 2*32 + DO_4ROUNDS + + vmovdqa Y_0, Y_2 + vmovdqa Y_1, Y_3 + + sub qword[rsp + frame.SRND], 1 + jne loop2 + + addm [8*0 + CTX],a + addm [8*1 + CTX],b + addm [8*2 + CTX],c + addm [8*3 + CTX],d + addm [8*4 + CTX],e + addm [8*5 + CTX],f + addm [8*6 + CTX],g + addm [8*7 + CTX],h + + mov INP, [rsp + frame.INP] + add INP, 128 + cmp INP, [rsp + frame.INPEND] + jne loop0 + + done_hash: +%ifdef WINABI + vmovdqa xmm6, [rsp + frame.XMMSAVE + 0*16] + vmovdqa xmm7, [rsp + frame.XMMSAVE + 1*16] + vmovdqa xmm8, [rsp + frame.XMMSAVE + 2*16] + vmovdqa xmm9, [rsp + frame.XMMSAVE + 3*16] +%endif + +; Restore GPRs + mov rbp, [rsp + frame.GPRSAVE + 8 * 0] + mov rbx, [rsp + frame.GPRSAVE + 8 * 1] + mov r12, [rsp + frame.GPRSAVE + 8 * 2] + mov r13, [rsp + frame.GPRSAVE + 8 * 3] + mov r14, [rsp + frame.GPRSAVE + 8 * 4] + mov r15, [rsp + frame.GPRSAVE + 8 * 5] +%ifdef WINABI + mov rsi, [rsp + frame.GPRSAVE + 8 * 6] + mov rdi, [rsp + frame.GPRSAVE + 8 * 7] +%endif + ; Restore Stack Pointer + mov rsp, [rsp + frame.RSPSAVE] + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data + +section .data + +align 64 +; K[t] used in SHA512 hashing +K512: + dq 
0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +align 32 + +; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +PSHUFFLE_BYTE_FLIP_MASK: ddq 0x08090a0b0c0d0e0f0001020304050607 + ddq 0x18191a1b1c1d1e1f1011121314151617 + +MASK_YMM_LO: ddq 0x00000000000000000000000000000000 + ddq 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha512_avx2_x86.asm b/src/Crypto/sha512_avx2_x86.asm new file mode 100644 index 00000000..31c8bd0d --- /dev/null +++ b/src/Crypto/sha512_avx2_x86.asm @@ -0,0 +1,10 @@ + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha512_sse4_x64.asm b/src/Crypto/sha512_sse4_x64.asm new file mode 100644 index 00000000..d4a99875 --- /dev/null +++ b/src/Crypto/sha512_sse4_x64.asm @@ -0,0 +1,416 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Example YASM command lines: +; Windows: yasm -f x64 -D WINABI sha512_sse4.asm +; Linux: yasm -f elf64 sha512_sse4.asm +; + +# Modified by kerukuro for use in cppcrypto. + +BITS 64 +section .text + +; Virtual Registers +%ifdef WINABI + %define msg rcx ; ARG1 + %define digest rdx ; ARG2 + %define msglen r8 ; ARG3 + %define T1 rsi + %define T2 rdi +%else + %define msg rdi ; ARG1 + %define digest rsi ; ARG2 + %define msglen rdx ; ARG3 + %define T1 rcx + %define T2 r8 +%endif +%define a_64 r9 +%define b_64 r10 +%define c_64 r11 +%define d_64 r12 +%define e_64 r13 +%define f_64 r14 +%define g_64 r15 +%define h_64 rbx +%define tmp0 rax + +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +struc frame + .W: resq 80 ; Message Schedule + .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1] + +%ifdef WINABI + .GPRSAVE: resq 7 +%else + .GPRSAVE: resq 5 +%endif +endstruc + +; Useful QWORD "arrays" for simpler memory references +%define MSG(i) msg + 8*(i) ; Input message (arg1) +%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2) +%define K_t(i) K512 + 8*(i) wrt rip ; SHA Constants (static mem) +%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame) +%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame) +; MSG, DIGEST, K_t, W_t are arrays +; WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even + +%macro RotateState 0 + ; Rotate symbles a..h right + %xdefine %%TMP h_64 + %xdefine h_64 g_64 + %xdefine g_64 f_64 + %xdefine f_64 e_64 + %xdefine e_64 d_64 + %xdefine d_64 c_64 + %xdefine c_64 b_64 + %xdefine b_64 a_64 + %xdefine a_64 %%TMP +%endmacro + +%macro SHA512_Round 1 +%assign %%t (%1) + + ; Compute Round %%t + mov T1, f_64 ; T1 = f + mov tmp0, e_64 ; tmp = e + xor T1, g_64 ; T1 = f ^ g + ror tmp0, 23 ; 41 ; tmp = e ror 23 + and T1, e_64 ; T1 = (f ^ g) & e + xor tmp0, e_64 ; tmp = (e ror 23) ^ e + xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler + ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4 + xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e + mov T2, a_64 ; T2 = a + add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h + ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add T1, 
tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov tmp0, a_64 ; tmp = a + xor T2, c_64 ; T2 = a ^ c + and tmp0, c_64 ; tmp = a & c + and T2, b_64 ; T2 = (a ^ c) & b + xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov tmp0, a_64 ; tmp = a + ror tmp0, 5 ; 39 ; tmp = a ror 5 + xor tmp0, a_64 ; tmp = (a ror 5) ^ a + add d_64, T1 ; e(next_state) = d + T1 + ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6 + xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a + lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c) + ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +%endmacro + +%macro SHA512_2Sched_2Round_sse 1 +%assign %%t (%1) + + ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brievity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. Vectored instructions (for the message scheduler) are indented + ; by two tabs. + + mov T1, f_64 + movdqa xmm2, [W_t(%%t-2)] ; XMM2 = W[t-2] + xor T1, g_64 + and T1, e_64 + movdqa xmm0, xmm2 ; XMM0 = W[t-2] + xor T1, g_64 + add T1, [WK_2(%%t)] + movdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15] + mov tmp0, e_64 + ror tmp0, 23 ; 41 + movdqa xmm3, xmm5 ; XMM3 = W[t-15] + xor tmp0, e_64 + ror tmp0, 4 ; 18 + psrlq xmm0, 61 - 19 ; XMM0 = W[t-2] >> 42 + xor tmp0, e_64 + ror tmp0, 14 ; 14 + psrlq xmm3, (8 - 7) ; XMM3 = W[t-15] >> 1 + add T1, tmp0 + add T1, h_64 + pxor xmm0, xmm2 ; XMM0 = (W[t-2] >> 42) ^ W[t-2] + mov T2, a_64 + xor T2, c_64 + pxor xmm3, xmm5 ; XMM3 = (W[t-15] >> 1) ^ W[t-15] + and T2, b_64 + mov tmp0, a_64 + psrlq xmm0, 19 - 6 ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13 + and tmp0, c_64 + xor T2, tmp0 + psrlq xmm3, (7 - 1) ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6 + mov tmp0, a_64 + ror tmp0, 5 ; 39 + pxor xmm0, xmm2 ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] + xor tmp0, a_64 + ror tmp0, 6 ; 34 + pxor xmm3, xmm5 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] + xor tmp0, a_64 + ror tmp0, 28 ; 28 + psrlq xmm0, 6 ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 + add T2, tmp0 + add d_64, T1 + psrlq xmm3, 1 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 + lea h_64, [T1 + T2] + RotateState + movdqa xmm1, xmm2 ; XMM1 = W[t-2] + mov T1, f_64 + xor T1, g_64 + movdqa xmm4, xmm5 ; XMM4 = W[t-15] + and T1, e_64 + xor T1, g_64 + psllq xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42 + add T1, [WK_2(%%t+1)] + mov tmp0, e_64 + psllq xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7 + ror tmp0, 23 ; 41 + xor tmp0, e_64 + pxor xmm1, xmm2 ; XMM1 = (W[t-2] << 42)^W[t-2] + ror tmp0, 4 ; 18 + xor tmp0, e_64 + pxor xmm4, xmm5 ; XMM4 = (W[t-15]<<7)^W[t-15] + ror tmp0, 14 ; 14 + add T1, tmp0 + psllq xmm1, (64 - 61) ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3 + add T1, h_64 + mov T2, a_64 + psllq xmm4, (64 - 8) ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56 + xor T2, c_64 + and T2, b_64 + pxor xmm0, xmm1 ; 
XMM0 = s1(W[t-2]) + mov tmp0, a_64 + and tmp0, c_64 + movdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7] + xor T2, tmp0 + pxor xmm3, xmm4 ; XMM3 = s0(W[t-15]) + mov tmp0, a_64 + paddq xmm0, xmm3 ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + ror tmp0, 5 ; 39 + paddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] + xor tmp0, a_64 + paddq xmm0, xmm1 ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + ror tmp0, 6 ; 34 + movdqa [W_t(%%t)], xmm0 ; Store scheduled qwords + xor tmp0, a_64 + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + ror tmp0, 28 ; 28 + movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds + add T2, tmp0 + add d_64, T1 + lea h_64, [T1 + T2] + RotateState +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_sse4(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. +; L is the message length in SHA512 blocks. +global sha512_sse4:function +global _sha512_sse4:function +sha512_sse4: +_sha512_sse4: + cmp msglen, 0 + je .nowork + + ; Allocate Stack Space + sub rsp, frame_size + + ; Save GPRs + mov [rsp + frame.GPRSAVE + 8 * 0], rbx + mov [rsp + frame.GPRSAVE + 8 * 1], r12 + mov [rsp + frame.GPRSAVE + 8 * 2], r13 + mov [rsp + frame.GPRSAVE + 8 * 3], r14 + mov [rsp + frame.GPRSAVE + 8 * 4], r15 +%ifdef WINABI + mov [rsp + frame.GPRSAVE + 8 * 5], rsi + mov [rsp + frame.GPRSAVE + 8 * 6], rdi +%endif + +.updateblock: + + ; Load state variables + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + %assign t 0 + %rep 80/2 + 1 + ; (80 rounds) / (2 rounds/iteration) + (1 iteration) + ; +1 iteration because the scheduler leads hashing by 1 iteration + %if t < 2 + ; BSWAP 2 QWORDS + movdqa xmm1, [XMM_QWORD_BSWAP wrt rip] + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 ; BSWAP + movdqa [W_t(t)], xmm0 ; Store Scheduled Pair + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + movdqa [WK_2(t)], xmm0 ; Store into WK for rounds + %elif t < 16 + ; BSWAP 2 QWORDS; Compute 2 Rounds + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 ; BSWAP + SHA512_Round t - 2 ; Round t-2 + movdqa [W_t(t)], xmm0 ; Store Scheduled Pair + paddq xmm0, [K_t(t)] ; Compute W[t]+K[t] + SHA512_Round t - 1 ; Round t-1 + movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK + %elif t < 79 + ; Schedule 2 QWORDS; Compute 2 Rounds + SHA512_2Sched_2Round_sse t + %else + ; Compute 2 Rounds + SHA512_Round t - 2 + SHA512_Round t - 1 + %endif + %assign t t+2 + %endrep + + ; Update digest + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + ; Advance to next message block + add msg, 16*8 + dec msglen + jnz .updateblock + + ; Restore GPRs + mov rbx, [rsp + frame.GPRSAVE + 8 * 0] + mov r12, [rsp + frame.GPRSAVE + 8 * 1] + mov r13, [rsp + frame.GPRSAVE + 8 * 2] + mov r14, [rsp + frame.GPRSAVE + 8 * 3] + mov r15, [rsp + frame.GPRSAVE + 8 * 4] +%ifdef WINABI + mov rsi, [rsp + frame.GPRSAVE + 8 * 5] + mov rdi, [rsp + frame.GPRSAVE + 8 * 6] +%endif + ; Restore Stack Pointer + add rsp, frame_size + +.nowork: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data + +section .data + +ALIGN 16 + +; Mask for byte-swapping a couple of qwords 
in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + ddq 0x08090a0b0c0d0e0f0001020304050607 + +; K[t] used in SHA512 hashing +K512: + dq 0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Crypto/sha512_sse4_x86.asm b/src/Crypto/sha512_sse4_x86.asm new file mode 100644 index 00000000..31c8bd0d --- /dev/null +++ b/src/Crypto/sha512_sse4_x86.asm @@ -0,0 +1,10 @@ + +%ifidn __OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/Driver/DriveFilter.c b/src/Driver/DriveFilter.c index 8195fe35..d46bd92e 100644 --- a/src/Driver/DriveFilter.c +++ b/src/Driver/DriveFilter.c @@ -330,7 +330,7 @@ static void ComputeBootLoaderFingerprint(PDEVICE_OBJECT LowerDeviceObject, byte* #if !defined (_WIN64) KFLOATING_SAVE floatingPointState; NTSTATUS saveStatus = STATUS_SUCCESS; - if (HasISSE()) + if (HasISSE()|| (HasSSE2() && HasMMX())) saveStatus = KeSaveFloatingPointState (&floatingPointState); #endif WHIRLPOOL_add (ioBuffer, TC_BOOT_SECTOR_PIM_VALUE_OFFSET, &whirlpool); @@ -368,7 +368,7 @@ static void ComputeBootLoaderFingerprint(PDEVICE_OBJECT LowerDeviceObject, byte* } #if !defined (_WIN64) - if (NT_SUCCESS (saveStatus) && HasISSE()) + if (NT_SUCCESS (saveStatus) && (HasISSE() || (HasSSE2() && HasMMX()))) KeRestoreFloatingPointState (&floatingPointState); #endif } diff --git a/src/Driver/Driver.vcxproj b/src/Driver/Driver.vcxproj 
index a108f426..381d2083 100644 --- a/src/Driver/Driver.vcxproj +++ b/src/Driver/Driver.vcxproj @@ -225,6 +225,33 @@ BuildDriver.cmd -rebuild -debug -x64 "$(SolutionDir)\Common" "$(SolutionDir)\Cry + + Document + + + Document + + + Document + + + Document + + + Document + + + Document + + + Document + + + Document + + + Document + diff --git a/src/Driver/Driver.vcxproj.filters b/src/Driver/Driver.vcxproj.filters index 5a44984d..3622c7a8 100644 --- a/src/Driver/Driver.vcxproj.filters +++ b/src/Driver/Driver.vcxproj.filters @@ -152,6 +152,33 @@ Source Files\Crypto + + Source Files\Crypto + + + Source Files\Crypto + + + Source Files\Crypto + + + Source Files\Crypto + + + Source Files\Crypto + + + Source Files\Crypto + + + Source Files\Crypto + + + Source Files\Crypto + + + Source Files\Crypto + diff --git a/src/Volume/Volume.make b/src/Volume/Volume.make index 0ecc7f42..e179c563 100644 --- a/src/Volume/Volume.make +++ b/src/Volume/Volume.make @@ -36,15 +36,32 @@ ifeq "$(PLATFORM)" "MacOSX" OBJSEX += ../Crypto/Twofish_asm.oo OBJSEX += ../Crypto/Camellia_asm.oo OBJSEX += ../Crypto/Camellia_aesni_asm.oo + OBJS += ../Crypto/sha256-nayuki.oo + OBJS += ../Crypto/sha512-nayuki.oo + OBJS += ../Crypto/sha256_avx1.oo + OBJS += ../Crypto/sha256_avx2.oo + OBJS += ../Crypto/sha256_sse4.oo + OBJS += ../Crypto/sha512_avx1.oo + OBJS += ../Crypto/sha512_avx2.oo + OBJS += ../Crypto/sha512_sse4.oo else ifeq "$(CPU_ARCH)" "x86" OBJS += ../Crypto/Aes_x86.o OBJS += ../Crypto/Aes_hw_cpu.o + OBJS += ../Crypto/sha256-x86-nayuki.o + OBJS += ../Crypto/sha512-x86-nayuki.o else ifeq "$(CPU_ARCH)" "x64" OBJS += ../Crypto/Aes_x64.o OBJS += ../Crypto/Aes_hw_cpu.o OBJS += ../Crypto/Twofish_x64.o OBJS += ../Crypto/Camellia_x64.o OBJS += ../Crypto/Camellia_aesni_x64.o + OBJS += ../Crypto/sha512-x64-nayuki.o + OBJS += ../Crypto/sha256_avx1_x64.o + OBJS += ../Crypto/sha256_avx2_x64.o + OBJS += ../Crypto/sha256_sse4_x64.o + OBJS += ../Crypto/sha512_avx1_x64.o + OBJS += ../Crypto/sha512_avx2_x64.o + OBJS += ../Crypto/sha512_sse4_x64.o else OBJS += ../Crypto/Aescrypt.o endif @@ -87,6 +104,33 @@ ifeq "$(PLATFORM)" "MacOSX" ../Crypto/Camellia_aesni_asm.oo: ../Crypto/Camellia_aesni_x64.S @echo Assembling $(