VeraCrypt
aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMounir IDRASSI <mounir.idrassi@idrix.fr>2017-06-23 02:07:32 +0200
committerMounir IDRASSI <mounir.idrassi@idrix.fr>2017-06-23 02:11:21 +0200
commit546d6cff4447a56bbf7c0e1a8b6f89dba5d3183b (patch)
tree7f8bfb3f7e7c6a0aab662fe6dec944cd6ee1a874
parentab7b5dc685eab3235dd748d8791cb39085ab0394 (diff)
downloadVeraCrypt-546d6cff4447a56bbf7c0e1a8b6f89dba5d3183b.tar.gz
VeraCrypt-546d6cff4447a56bbf7c0e1a8b6f89dba5d3183b.zip
Crypto: Add optimized SHA-512 and SHA-256 assembly implementations for x86_64 and x86. This improves speed by 30%.
-rw-r--r--src/Common/Pkcs5.c21
-rw-r--r--src/Common/Tests.c4
-rw-r--r--src/Crypto/Crypto.vcxproj83
-rw-r--r--src/Crypto/Crypto.vcxproj.filters27
-rw-r--r--src/Crypto/Makefile.inc33
-rw-r--r--src/Crypto/Sha2.c1401
-rw-r--r--src/Crypto/Sha2.h151
-rw-r--r--src/Crypto/Sources21
-rw-r--r--src/Crypto/sha256-x64-nayuki.S6
-rw-r--r--src/Crypto/sha256-x86-nayuki.S168
-rw-r--r--src/Crypto/sha256_avx1_x64.asm596
-rw-r--r--src/Crypto/sha256_avx1_x86.asm10
-rw-r--r--src/Crypto/sha256_avx2_x64.asm840
-rw-r--r--src/Crypto/sha256_avx2_x86.asm10
-rw-r--r--src/Crypto/sha256_sse4_x64.asm560
-rw-r--r--src/Crypto/sha256_sse4_x86.asm10
-rw-r--r--src/Crypto/sha512-x64-nayuki.S202
-rw-r--r--src/Crypto/sha512-x86-nayuki.S180
-rw-r--r--src/Crypto/sha512_avx1_x64.asm427
-rw-r--r--src/Crypto/sha512_avx1_x86.asm10
-rw-r--r--src/Crypto/sha512_avx2_x64.asm804
-rw-r--r--src/Crypto/sha512_avx2_x86.asm10
-rw-r--r--src/Crypto/sha512_sse4_x64.asm416
-rw-r--r--src/Crypto/sha512_sse4_x86.asm10
-rw-r--r--src/Driver/DriveFilter.c4
-rw-r--r--src/Driver/Driver.vcxproj27
-rw-r--r--src/Driver/Driver.vcxproj.filters27
-rw-r--r--src/Volume/Volume.make44
28 files changed, 5313 insertions, 789 deletions
diff --git a/src/Common/Pkcs5.c b/src/Common/Pkcs5.c
index 1da5e237..c33f1dab 100644
--- a/src/Common/Pkcs5.c
+++ b/src/Common/Pkcs5.c
@@ -327,6 +327,12 @@ void hmac_sha512
char* buf = hmac.k;
int b;
char key[SHA512_DIGESTSIZE];
+#if defined (DEVICE_DRIVER) && !defined (_WIN64)
+ KFLOATING_SAVE floatingPointState;
+ NTSTATUS saveStatus = STATUS_SUCCESS;
+ if (HasSSE2() && HasMMX())
+ saveStatus = KeSaveFloatingPointState (&floatingPointState);
+#endif
/* If the key is longer than the hash algorithm block size,
let key = sha512(key), as per HMAC specifications. */
@@ -369,6 +375,11 @@ void hmac_sha512
hmac_sha512_internal (d, ld, &hmac);
+#if defined (DEVICE_DRIVER) && !defined (_WIN64)
+ if (NT_SUCCESS (saveStatus) && (HasSSE2() && HasMMX()))
+ KeRestoreFloatingPointState (&floatingPointState);
+#endif
+
/* Prevent leaks */
burn (&hmac, sizeof(hmac));
burn (key, sizeof(key));
@@ -408,6 +419,12 @@ void derive_key_sha512 (char *pwd, int pwd_len, char *salt, int salt_len, uint32
char* buf = hmac.k;
int b, l, r;
char key[SHA512_DIGESTSIZE];
+#if defined (DEVICE_DRIVER) && !defined (_WIN64)
+ KFLOATING_SAVE floatingPointState;
+ NTSTATUS saveStatus = STATUS_SUCCESS;
+ if (HasSSE2() && HasMMX())
+ saveStatus = KeSaveFloatingPointState (&floatingPointState);
+#endif
/* If the password is longer than the hash algorithm block size,
let pwd = sha512(pwd), as per HMAC specifications. */
@@ -471,6 +488,10 @@ void derive_key_sha512 (char *pwd, int pwd_len, char *salt, int salt_len, uint32
derive_u_sha512 (salt, salt_len, iterations, b, &hmac);
memcpy (dk, hmac.u, r);
+#if defined (DEVICE_DRIVER) && !defined (_WIN64)
+ if (NT_SUCCESS (saveStatus) && (HasSSE2() && HasMMX()))
+ KeRestoreFloatingPointState (&floatingPointState);
+#endif
/* Prevent possible leaks. */
burn (&hmac, sizeof(hmac));
diff --git a/src/Common/Tests.c b/src/Common/Tests.c
index cf0c8699..c70954a6 100644
--- a/src/Common/Tests.c
+++ b/src/Common/Tests.c
@@ -584,7 +584,7 @@ BOOL RunHashTest (HashFunction fn, HashTestVector* vector, BOOL bUseSSE)
#if defined (DEVICE_DRIVER) && !defined (_WIN64)
KFLOATING_SAVE floatingPointState;
NTSTATUS saveStatus = STATUS_SUCCESS;
- if (bUseSSE && (HasSSE2() || HasSSE41()))
+ if (bUseSSE && (HasISSE() || HasSSE2()))
saveStatus = KeSaveFloatingPointState (&floatingPointState);
#endif
while (vector[i].hexInput && vector[i].hexOutput)
@@ -601,7 +601,7 @@ BOOL RunHashTest (HashFunction fn, HashTestVector* vector, BOOL bUseSSE)
}
#if defined (DEVICE_DRIVER) && !defined (_WIN64)
- if (NT_SUCCESS (saveStatus) && bUseSSE && (HasSSE2() || HasSSE41()))
+ if (NT_SUCCESS (saveStatus) && bUseSSE && (HasISSE() || HasSSE2()))
KeRestoreFloatingPointState (&floatingPointState);
#endif
diff --git a/src/Crypto/Crypto.vcxproj b/src/Crypto/Crypto.vcxproj
index d7b686b1..c57f54d0 100644
--- a/src/Crypto/Crypto.vcxproj
+++ b/src/Crypto/Crypto.vcxproj
@@ -284,6 +284,89 @@
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
</CustomBuild>
</ItemGroup>
+ <ItemGroup>
+ <CustomBuild Include="sha256-x86-nayuki.S">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">echo %(Filename)%(Extension) &amp; vsyasm.exe -Xvc -p gas -D WINABI -f win32 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">echo %(Filename)%(Extension) &amp; vsyasm.exe -Xvc -p gas -D WINABI -f win32 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="sha256_avx1_x64.asm">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="sha256_avx2_x64.asm">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="sha256_sse4_x64.asm">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="sha512-x86-nayuki.S">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">echo %(Filename)%(Extension) &amp; vsyasm.exe -Xvc -p gas -D WINABI -f win32 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">echo %(Filename)%(Extension) &amp; vsyasm.exe -Xvc -p gas -D WINABI -f win32 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="sha512-x64-nayuki.S">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -Xvc -p gas -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -Xvc -p gas -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="sha512_avx1_x64.asm">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="sha512_avx2_x64.asm">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="sha512_sse4_x64.asm">
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">echo %(Filename)%(Extension) &amp; yasm.exe -D WINABI -f x64 -o "$(TargetDir)\%(Filename).obj" -l "$(TargetDir)\%(Filename).lst" "%(FullPath)"</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)\%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ </ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
diff --git a/src/Crypto/Crypto.vcxproj.filters b/src/Crypto/Crypto.vcxproj.filters
index d94e0bc4..b0122300 100644
--- a/src/Crypto/Crypto.vcxproj.filters
+++ b/src/Crypto/Crypto.vcxproj.filters
@@ -130,5 +130,32 @@
<CustomBuild Include="Camellia_aesni_x64.S">
<Filter>Source Files</Filter>
</CustomBuild>
+ <CustomBuild Include="sha512-x64-nayuki.S">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
+ <CustomBuild Include="sha256-x86-nayuki.S">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
+ <CustomBuild Include="sha256_avx1_x64.asm">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
+ <CustomBuild Include="sha256_avx2_x64.asm">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
+ <CustomBuild Include="sha256_sse4_x64.asm">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
+ <CustomBuild Include="sha512-x86-nayuki.S">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
+ <CustomBuild Include="sha512_avx1_x64.asm">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
+ <CustomBuild Include="sha512_avx2_x64.asm">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
+ <CustomBuild Include="sha512_sse4_x64.asm">
+ <Filter>Source Files</Filter>
+ </CustomBuild>
</ItemGroup>
</Project> \ No newline at end of file
diff --git a/src/Crypto/Makefile.inc b/src/Crypto/Makefile.inc
index 9fecd39e..b1db4434 100644
--- a/src/Crypto/Makefile.inc
+++ b/src/Crypto/Makefile.inc
@@ -1,9 +1,9 @@
TC_ASFLAGS = -Xvc -Ox
-VC_YASMFLAGS = -Xvc -p gas -D WINABI
+VC_YASMFLAGS = -Xvc -D WINABI
!if "$(TC_ARCH)" == "x86"
TC_ASFLAGS = $(TC_ASFLAGS) -f win32 --prefix _ -D MS_STDCALL -D DLL_EXPORT
-VC_YASMFLAGS = $(VC_YASMFLAGS) -f win32
+VC_YASMFLAGS = $(VC_YASMFLAGS) -f win32 -D MS_STDCALL
!else
TC_ASFLAGS = $(TC_ASFLAGS) -f win64
VC_YASMFLAGS = $(VC_YASMFLAGS) -f win64
@@ -21,11 +21,34 @@ TC_ASM_ERR_LOG = ..\Driver\build_errors_asm.log
nasm.exe $(TC_ASFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Aes_hw_cpu.lst" Aes_hw_cpu.asm 2>$(TC_ASM_ERR_LOG)
"$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).obj": Twofish_$(TC_ARCH).S
- yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).lst" Twofish_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG)
+ yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).lst" Twofish_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG)
"$(OBJ_PATH)\$(O)\Camellia_$(TC_ARCH).obj": Camellia_$(TC_ARCH).S
- yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Camellia_$(TC_ARCH).lst" Camellia_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG)
+ yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\Camellia_$(TC_ARCH).lst" Camellia_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG)
"$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).obj": Camellia_aesni_$(TC_ARCH).S
- yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).lst" Camellia_aesni_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG)
+ yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).lst" Camellia_aesni_$(TC_ARCH).S 2>$(TC_ASM_ERR_LOG)
+"$(OBJ_PATH)\$(O)\sha256-$(TC_ARCH)-nayuki.obj": sha256-$(TC_ARCH)-nayuki.S
+ yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\sha256-$(TC_ARCH)-nayuki.lst" sha256-$(TC_ARCH)-nayuki.S 2>$(TC_ASM_ERR_LOG)
+
+"$(OBJ_PATH)\$(O)\sha512-$(TC_ARCH)-nayuki.obj": sha512-$(TC_ARCH)-nayuki.S
+ yasm.exe $(VC_YASMFLAGS) -p gas -o "$@" -l "$(OBJ_PATH)\$(O)\sha512-$(TC_ARCH)-nayuki.lst" sha512-$(TC_ARCH)-nayuki.S 2>$(TC_ASM_ERR_LOG)
+
+"$(OBJ_PATH)\$(O)\sha512_avx1_$(TC_ARCH).obj": sha512_avx1_$(TC_ARCH).asm
+ yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha512_avx1_$(TC_ARCH).lst" sha512_avx1_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG)
+
+"$(OBJ_PATH)\$(O)\sha512_avx2_$(TC_ARCH).obj": sha512_avx2_$(TC_ARCH).asm
+ yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha512_avx2_$(TC_ARCH).lst" sha512_avx2_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG)
+
+"$(OBJ_PATH)\$(O)\sha512_sse4_$(TC_ARCH).obj": sha512_sse4_$(TC_ARCH).asm
+ yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha512_sse4_$(TC_ARCH).lst" sha512_sse4_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG)
+
+"$(OBJ_PATH)\$(O)\sha256_avx1_$(TC_ARCH).obj": sha256_avx1_$(TC_ARCH).asm
+ yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha256_avx1_$(TC_ARCH).lst" sha256_avx1_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG)
+
+"$(OBJ_PATH)\$(O)\sha256_avx2_$(TC_ARCH).obj": sha256_avx2_$(TC_ARCH).asm
+ yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha256_avx2_$(TC_ARCH).lst" sha256_avx2_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG)
+
+"$(OBJ_PATH)\$(O)\sha256_sse4_$(TC_ARCH).obj": sha256_sse4_$(TC_ARCH).asm
+ yasm.exe $(VC_YASMFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\sha256_sse4_$(TC_ARCH).lst" sha256_sse4_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG)
diff --git a/src/Crypto/Sha2.c b/src/Crypto/Sha2.c
index 9dbb529f..05da532e 100644
--- a/src/Crypto/Sha2.c
+++ b/src/Crypto/Sha2.c
@@ -1,767 +1,860 @@
/*
- ---------------------------------------------------------------------------
- Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
- 1. source code distributions include the above copyright notice, this
- list of conditions and the following disclaimer;
-
- 2. binary distributions include the above copyright notice, this list
- of conditions and the following disclaimer in their documentation;
-
- 3. the name of the copyright holder is not used to endorse products
- built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 01/08/2005
-
- This is a byte oriented version of SHA2 that operates on arrays of bytes
- stored in memory. This code implements sha256, sha384 and sha512 but the
- latter two functions rely on efficient 64-bit integer operations that
- may not be very efficient on 32-bit machines
-
- The sha256 functions use a type 'sha256_ctx' to hold details of the
- current hash state and uses the following three calls:
-
- void sha256_begin(sha256_ctx ctx[1])
- void sha256_hash(const unsigned char data[],
- unsigned long len, sha256_ctx ctx[1])
- void sha_end1(unsigned char hval[], sha256_ctx ctx[1])
-
- The first subroutine initialises a hash computation by setting up the
- context in the sha256_ctx context. The second subroutine hashes 8-bit
- bytes from array data[] into the hash state withinh sha256_ctx context,
- the number of bytes to be hashed being given by the the unsigned long
- integer len. The third subroutine completes the hash calculation and
- places the resulting digest value in the array of 8-bit bytes hval[].
-
- The sha384 and sha512 functions are similar and use the interfaces:
-
- void sha384_begin(sha384_ctx ctx[1]);
- void sha384_hash(const unsigned char data[],
- unsigned long len, sha384_ctx ctx[1]);
- void sha384_end(unsigned char hval[], sha384_ctx ctx[1]);
-
- void sha512_begin(sha512_ctx ctx[1]);
- void sha512_hash(const unsigned char data[],
- unsigned long len, sha512_ctx ctx[1]);
- void sha512_end(unsigned char hval[], sha512_ctx ctx[1]);
-
- In addition there is a function sha2 that can be used to call all these
- functions using a call with a hash length parameter as follows:
-
- int sha2_begin(unsigned long len, sha2_ctx ctx[1]);
- void sha2_hash(const unsigned char data[],
- unsigned long len, sha2_ctx ctx[1]);
- void sha2_end(unsigned char hval[], sha2_ctx ctx[1]);
-
- My thanks to Erik Andersen <andersen@codepoet.org> for testing this code
- on big-endian systems and for his assistance with corrections
+This code is written by kerukuro for cppcrypto library (http://cppcrypto.sourceforge.net/)
+and released into public domain.
*/
+/* Modified for VeraCrypt with speed optimization for C implementation */
+
+#include "Sha2.h"
#include "Common/Endian.h"
-#include "Common/Tcdefs.h"
+#include "Crypto/cpu.h"
#include "Crypto/misc.h"
-#define PLATFORM_BYTE_ORDER BYTE_ORDER
-#define IS_LITTLE_ENDIAN LITTLE_ENDIAN
-
-#if 0
-#define UNROLL_SHA2 /* for SHA2 loop unroll */
+#ifdef _UEFI
+#define NO_OPTIMIZED_VERSIONS
#endif
-#if !defined(_UEFI)
-#include <string.h> /* for memcpy() etc. */
-#endif // !defined(_UEFI)
-
-#include "Sha2.h"
+#ifndef NO_OPTIMIZED_VERSIONS
#if defined(__cplusplus)
extern "C"
{
#endif
-
-#if defined( _MSC_VER ) && ( _MSC_VER > 800 ) && !defined(_UEFI)
-#pragma intrinsic(memcpy)
+#if CRYPTOPP_BOOL_X64
+ void sha512_rorx(const void* M, void* D, uint_64t l);
+ void sha512_sse4(const void* M, uint_64t D[8], uint_64t l);
+ void sha512_avx(const void* M, void* D, uint_64t l);
#endif
-
-#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-#define SWAP_BYTES
-#else
-#undef SWAP_BYTES
+
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
+ void sha512_compress_nayuki(uint_64t state[8], const uint_8t block[128]);
#endif
-
-#if 0
-
-#define ch(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
-#define maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
-
-#else /* Thanks to Rich Schroeppel and Colin Plumb for the following */
-
-#define ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
-#define maj(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y))))
-
+#if defined(__cplusplus)
+}
#endif
-/* round transforms for SHA256 and SHA512 compression functions */
-
-#define vf(n,i) v[(n - i) & 7]
-
-#define hf(i) (p[i & 15] += \
- g_1(p[(i + 14) & 15]) + p[(i + 9) & 15] + g_0(p[(i + 1) & 15]))
-
-#define v_cycle(i,j) \
- vf(7,i) += (j ? hf(i) : p[i]) + k_0[i+j] \
- + s_1(vf(4,i)) + ch(vf(4,i),vf(5,i),vf(6,i)); \
- vf(3,i) += vf(7,i); \
- vf(7,i) += s_0(vf(0,i))+ maj(vf(0,i),vf(1,i),vf(2,i))
-
-#if defined(SHA_224) || defined(SHA_256)
-
-#define SHA256_MASK (SHA256_BLOCK_SIZE - 1)
+#endif
-#if defined(SWAP_BYTES)
-#define bsw_32(p,n) \
- { int _i = (n); while(_i--) ((uint_32t*)p)[_i] = bswap_32(((uint_32t*)p)[_i]); }
-#else
-#define bsw_32(p,n)
-#endif
-
-#define s_0(x) (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22))
-#define s_1(x) (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25))
-#define g_0(x) (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3))
-#define g_1(x) (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))
-#define k_0 k256
-
-/* rotated SHA256 round definition. Rather than swapping variables as in */
-/* FIPS-180, different variables are 'rotated' on each round, returning */
-/* to their starting positions every eight rounds */
-
-#define q(n) v##n
-
-#define one_cycle(a,b,c,d,e,f,g,h,k,w) \
- q(h) += s_1(q(e)) + ch(q(e), q(f), q(g)) + k + w; \
- q(d) += q(h); q(h) += s_0(q(a)) + maj(q(a), q(b), q(c))
-
-/* SHA256 mixing data */
-
-const uint_32t k256[64] =
-{ 0x428a2f98ul, 0x71374491ul, 0xb5c0fbcful, 0xe9b5dba5ul,
- 0x3956c25bul, 0x59f111f1ul, 0x923f82a4ul, 0xab1c5ed5ul,
- 0xd807aa98ul, 0x12835b01ul, 0x243185beul, 0x550c7dc3ul,
- 0x72be5d74ul, 0x80deb1feul, 0x9bdc06a7ul, 0xc19bf174ul,
- 0xe49b69c1ul, 0xefbe4786ul, 0x0fc19dc6ul, 0x240ca1ccul,
- 0x2de92c6ful, 0x4a7484aaul, 0x5cb0a9dcul, 0x76f988daul,
- 0x983e5152ul, 0xa831c66dul, 0xb00327c8ul, 0xbf597fc7ul,
- 0xc6e00bf3ul, 0xd5a79147ul, 0x06ca6351ul, 0x14292967ul,
- 0x27b70a85ul, 0x2e1b2138ul, 0x4d2c6dfcul, 0x53380d13ul,
- 0x650a7354ul, 0x766a0abbul, 0x81c2c92eul, 0x92722c85ul,
- 0xa2bfe8a1ul, 0xa81a664bul, 0xc24b8b70ul, 0xc76c51a3ul,
- 0xd192e819ul, 0xd6990624ul, 0xf40e3585ul, 0x106aa070ul,
- 0x19a4c116ul, 0x1e376c08ul, 0x2748774cul, 0x34b0bcb5ul,
- 0x391c0cb3ul, 0x4ed8aa4aul, 0x5b9cca4ful, 0x682e6ff3ul,
- 0x748f82eeul, 0x78a5636ful, 0x84c87814ul, 0x8cc70208ul,
- 0x90befffaul, 0xa4506cebul, 0xbef9a3f7ul, 0xc67178f2ul,
+typedef void (*transformFn)(sha512_ctx* ctx, void* m, uint_64t num_blks);
+
+transformFn transfunc = NULL;
+
+static const uint_64t K[80] = {
+ 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
+ 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
+ 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
+ 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694,
+ 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
+ 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
+ 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
+ 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70,
+ 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
+ 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b,
+ 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30,
+ 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
+ 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
+ 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
+ 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
+ 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
+ 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
+ 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
+ 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
+ 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
};
-/* Compile 64 bytes of hash data into SHA256 digest value */
-/* NOTE: this routine assumes that the byte order in the */
-/* ctx->wbuf[] at this point is such that low address bytes */
-/* in the ORIGINAL byte stream will go into the high end of */
-/* words on BOTH big and little endian systems */
-VOID_RETURN sha256_compile(sha256_ctx ctx[1])
+#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y))))
+#define sum0(x) (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39))
+#define sum1(x) (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41))
+#define sigma0(x) (rotr64((x), 1) ^ rotr64((x), 8) ^ ((x) >> 7))
+#define sigma1(x) (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6))
+
+#define WU(j) (W[j & 15] += sigma1(W[(j + 14) & 15]) + W[(j + 9) & 15] + sigma0(W[(j + 1) & 15]))
+
+#define COMPRESS_ROUND(i, j, K) \
+ T1 = h + sum1(e) + Ch(e, f, g) + K[i + j] + (i? WU(j): W[j]); \
+ T2 = sum0(a) + Maj(a, b, c); \
+ h = g; \
+ g = f; \
+ f = e; \
+ e = d + T1; \
+ d = c; \
+ c = b; \
+ b = a; \
+ a = T1 + T2;
+
+void StdTransform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
{
-#if !defined(UNROLL_SHA2)
-
- uint_32t j, *p = ctx->wbuf, v[8];
-
- memcpy(v, ctx->hash, 8 * sizeof(uint_32t));
-
- for(j = 0; j < 64; j += 16)
- {
- v_cycle( 0, j); v_cycle( 1, j);
- v_cycle( 2, j); v_cycle( 3, j);
- v_cycle( 4, j); v_cycle( 5, j);
- v_cycle( 6, j); v_cycle( 7, j);
- v_cycle( 8, j); v_cycle( 9, j);
- v_cycle(10, j); v_cycle(11, j);
- v_cycle(12, j); v_cycle(13, j);
- v_cycle(14, j); v_cycle(15, j);
- }
-
- ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
- ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
- ctx->hash[4] += v[4]; ctx->hash[5] += v[5];
- ctx->hash[6] += v[6]; ctx->hash[7] += v[7];
+ uint_64t blk;
+ for (blk = 0; blk < num_blks; blk++)
+ {
+ uint_64t W[16];
+ uint_64t a,b,c,d,e,f,g,h;
+ uint_64t T1, T2;
+ int i;
+#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
+ int j;
+#endif
+ for (i = 0; i < 128 / 8; i++)
+ {
+ W[i] = bswap_64((((const uint_64t*)(mp))[blk * 16 + i]));
+ }
+
+ a = ctx->hash[0];
+ b = ctx->hash[1];
+ c = ctx->hash[2];
+ d = ctx->hash[3];
+ e = ctx->hash[4];
+ f = ctx->hash[5];
+ g = ctx->hash[6];
+ h = ctx->hash[7];
+
+ for (i = 0; i <= 79; i+=16)
+ {
+#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
+ for (j = 0; j < 16; j++)
+ {
+ COMPRESS_ROUND(i, j, K);
+ }
#else
-
- uint_32t *p = ctx->wbuf,v0,v1,v2,v3,v4,v5,v6,v7;
-
- v0 = ctx->hash[0]; v1 = ctx->hash[1];
- v2 = ctx->hash[2]; v3 = ctx->hash[3];
- v4 = ctx->hash[4]; v5 = ctx->hash[5];
- v6 = ctx->hash[6]; v7 = ctx->hash[7];
-
- one_cycle(0,1,2,3,4,5,6,7,k256[ 0],p[ 0]);
- one_cycle(7,0,1,2,3,4,5,6,k256[ 1],p[ 1]);
- one_cycle(6,7,0,1,2,3,4,5,k256[ 2],p[ 2]);
- one_cycle(5,6,7,0,1,2,3,4,k256[ 3],p[ 3]);
- one_cycle(4,5,6,7,0,1,2,3,k256[ 4],p[ 4]);
- one_cycle(3,4,5,6,7,0,1,2,k256[ 5],p[ 5]);
- one_cycle(2,3,4,5,6,7,0,1,k256[ 6],p[ 6]);
- one_cycle(1,2,3,4,5,6,7,0,k256[ 7],p[ 7]);
- one_cycle(0,1,2,3,4,5,6,7,k256[ 8],p[ 8]);
- one_cycle(7,0,1,2,3,4,5,6,k256[ 9],p[ 9]);
- one_cycle(6,7,0,1,2,3,4,5,k256[10],p[10]);
- one_cycle(5,6,7,0,1,2,3,4,k256[11],p[11]);
- one_cycle(4,5,6,7,0,1,2,3,k256[12],p[12]);
- one_cycle(3,4,5,6,7,0,1,2,k256[13],p[13]);
- one_cycle(2,3,4,5,6,7,0,1,k256[14],p[14]);
- one_cycle(1,2,3,4,5,6,7,0,k256[15],p[15]);
-
- one_cycle(0,1,2,3,4,5,6,7,k256[16],hf( 0));
- one_cycle(7,0,1,2,3,4,5,6,k256[17],hf( 1));
- one_cycle(6,7,0,1,2,3,4,5,k256[18],hf( 2));
- one_cycle(5,6,7,0,1,2,3,4,k256[19],hf( 3));
- one_cycle(4,5,6,7,0,1,2,3,k256[20],hf( 4));
- one_cycle(3,4,5,6,7,0,1,2,k256[21],hf( 5));
- one_cycle(2,3,4,5,6,7,0,1,k256[22],hf( 6));
- one_cycle(1,2,3,4,5,6,7,0,k256[23],hf( 7));
- one_cycle(0,1,2,3,4,5,6,7,k256[24],hf( 8));
- one_cycle(7,0,1,2,3,4,5,6,k256[25],hf( 9));
- one_cycle(6,7,0,1,2,3,4,5,k256[26],hf(10));
- one_cycle(5,6,7,0,1,2,3,4,k256[27],hf(11));
- one_cycle(4,5,6,7,0,1,2,3,k256[28],hf(12));
- one_cycle(3,4,5,6,7,0,1,2,k256[29],hf(13));
- one_cycle(2,3,4,5,6,7,0,1,k256[30],hf(14));
- one_cycle(1,2,3,4,5,6,7,0,k256[31],hf(15));
-
- one_cycle(0,1,2,3,4,5,6,7,k256[32],hf( 0));
- one_cycle(7,0,1,2,3,4,5,6,k256[33],hf( 1));
- one_cycle(6,7,0,1,2,3,4,5,k256[34],hf( 2));
- one_cycle(5,6,7,0,1,2,3,4,k256[35],hf( 3));
- one_cycle(4,5,6,7,0,1,2,3,k256[36],hf( 4));
- one_cycle(3,4,5,6,7,0,1,2,k256[37],hf( 5));
- one_cycle(2,3,4,5,6,7,0,1,k256[38],hf( 6));
- one_cycle(1,2,3,4,5,6,7,0,k256[39],hf( 7));
- one_cycle(0,1,2,3,4,5,6,7,k256[40],hf( 8));
- one_cycle(7,0,1,2,3,4,5,6,k256[41],hf( 9));
- one_cycle(6,7,0,1,2,3,4,5,k256[42],hf(10));
- one_cycle(5,6,7,0,1,2,3,4,k256[43],hf(11));
- one_cycle(4,5,6,7,0,1,2,3,k256[44],hf(12));
- one_cycle(3,4,5,6,7,0,1,2,k256[45],hf(13));
- one_cycle(2,3,4,5,6,7,0,1,k256[46],hf(14));
- one_cycle(1,2,3,4,5,6,7,0,k256[47],hf(15));
-
- one_cycle(0,1,2,3,4,5,6,7,k256[48],hf( 0));
- one_cycle(7,0,1,2,3,4,5,6,k256[49],hf( 1));
- one_cycle(6,7,0,1,2,3,4,5,k256[50],hf( 2));
- one_cycle(5,6,7,0,1,2,3,4,k256[51],hf( 3));
- one_cycle(4,5,6,7,0,1,2,3,k256[52],hf( 4));
- one_cycle(3,4,5,6,7,0,1,2,k256[53],hf( 5));
- one_cycle(2,3,4,5,6,7,0,1,k256[54],hf( 6));
- one_cycle(1,2,3,4,5,6,7,0,k256[55],hf( 7));
- one_cycle(0,1,2,3,4,5,6,7,k256[56],hf( 8));
- one_cycle(7,0,1,2,3,4,5,6,k256[57],hf( 9));
- one_cycle(6,7,0,1,2,3,4,5,k256[58],hf(10));
- one_cycle(5,6,7,0,1,2,3,4,k256[59],hf(11));
- one_cycle(4,5,6,7,0,1,2,3,k256[60],hf(12));
- one_cycle(3,4,5,6,7,0,1,2,k256[61],hf(13));
- one_cycle(2,3,4,5,6,7,0,1,k256[62],hf(14));
- one_cycle(1,2,3,4,5,6,7,0,k256[63],hf(15));
-
- ctx->hash[0] += v0; ctx->hash[1] += v1;
- ctx->hash[2] += v2; ctx->hash[3] += v3;
- ctx->hash[4] += v4; ctx->hash[5] += v5;
- ctx->hash[6] += v6; ctx->hash[7] += v7;
+ COMPRESS_ROUND(i, 0, K);
+ COMPRESS_ROUND(i, 1, K);
+ COMPRESS_ROUND(i , 2, K);
+ COMPRESS_ROUND(i, 3, K);
+ COMPRESS_ROUND(i, 4, K);
+ COMPRESS_ROUND(i, 5, K);
+ COMPRESS_ROUND(i, 6, K);
+ COMPRESS_ROUND(i, 7, K);
+ COMPRESS_ROUND(i, 8, K);
+ COMPRESS_ROUND(i, 9, K);
+ COMPRESS_ROUND(i, 10, K);
+ COMPRESS_ROUND(i, 11, K);
+ COMPRESS_ROUND(i, 12, K);
+ COMPRESS_ROUND(i, 13, K);
+ COMPRESS_ROUND(i, 14, K);
+ COMPRESS_ROUND(i, 15, K);
#endif
+ }
+ ctx->hash[0] += a;
+ ctx->hash[1] += b;
+ ctx->hash[2] += c;
+ ctx->hash[3] += d;
+ ctx->hash[4] += e;
+ ctx->hash[5] += f;
+ ctx->hash[6] += g;
+ ctx->hash[7] += h;
+ }
}
-/* SHA256 hash data in an array of bytes into hash buffer */
-/* and call the hash_compile function as required. */
-
-VOID_RETURN sha256_hash(const unsigned char data[], unsigned long len, sha256_ctx ctx[1])
-{ uint_32t pos = (uint_32t)(ctx->count[0] & SHA256_MASK),
- space = SHA256_BLOCK_SIZE - pos;
- const unsigned char *sp = data;
-
- if((ctx->count[0] += len) < len)
- ++(ctx->count[1]);
-
- while(len >= space) /* tranfer whole blocks while possible */
- {
- memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space);
- sp += space; len -= space; space = SHA256_BLOCK_SIZE; pos = 0;
- bsw_32(ctx->wbuf, SHA256_BLOCK_SIZE >> 2)
- sha256_compile(ctx);
- }
-
- memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len);
-}
-
-/* SHA256 Final padding and digest calculation */
-
-static void sha_end1(unsigned char hval[], sha256_ctx ctx[1], const unsigned int hlen)
-{ uint_32t i = (uint_32t)(ctx->count[0] & SHA256_MASK);
-
- /* put bytes in the buffer in an order in which references to */
- /* 32-bit words will put bytes with lower addresses into the */
- /* top of 32 bit words on BOTH big and little endian machines */
- bsw_32(ctx->wbuf, (i + 3) >> 2)
-
- /* we now need to mask valid bytes and add the padding which is */
- /* a single 1 bit and as many zero bits as necessary. Note that */
- /* we can always add the first padding byte here because the */
- /* buffer always has at least one empty slot */
- ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3);
- ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3);
-
- /* we need 9 or more empty positions, one for the padding byte */
- /* (above) and eight for the length count. If there is not */
- /* enough space pad and empty the buffer */
- if(i > SHA256_BLOCK_SIZE - 9)
- {
- if(i < 60) ctx->wbuf[15] = 0;
- sha256_compile(ctx);
- i = 0;
- }
- else /* compute a word index for the empty buffer positions */
- i = (i >> 2) + 1;
-
- while(i < 14) /* and zero pad all but last two positions */
- ctx->wbuf[i++] = 0;
-
- /* the following 32-bit length fields are assembled in the */
- /* wrong byte order on little endian machines but this is */
- /* corrected later since they are only ever used as 32-bit */
- /* word values. */
- ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29);
- ctx->wbuf[15] = ctx->count[0] << 3;
- sha256_compile(ctx);
-
- /* extract the hash value as bytes in case the hash buffer is */
- /* mislaigned for 32-bit words */
- for(i = 0; i < hlen; ++i)
- hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3)));
-}
-
-#endif
-
-#if defined(SHA_224)
+#ifndef NO_OPTIMIZED_VERSIONS
-const uint_32t i224[8] =
+#if CRYPTOPP_BOOL_X64
+void Avx2Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
{
- 0xc1059ed8ul, 0x367cd507ul, 0x3070dd17ul, 0xf70e5939ul,
- 0xffc00b31ul, 0x68581511ul, 0x64f98fa7ul, 0xbefa4fa4ul
-};
+ if (num_blks > 1)
+ sha512_rorx(mp, ctx->hash, num_blks);
+ else
+ sha512_sse4(mp, ctx->hash, num_blks);
+}
-VOID_RETURN sha224_begin(sha224_ctx ctx[1])
+void AvxTransform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
{
- ctx->count[0] = ctx->count[1] = 0;
- memcpy(ctx->hash, i224, 8 * sizeof(uint_32t));
+ if (num_blks > 1)
+ sha512_avx(mp, ctx->hash, num_blks);
+ else
+ sha512_sse4(mp, ctx->hash, num_blks);
}
-VOID_RETURN sha224_end(unsigned char hval[], sha224_ctx ctx[1])
+void SSE4Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
{
- sha_end1(hval, ctx, SHA224_DIGEST_SIZE);
+ sha512_sse4(mp, ctx->hash, num_blks);
}
+#endif
-VOID_RETURN sha224(unsigned char hval[], const unsigned char data[], unsigned long len)
-{ sha224_ctx cx[1];
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
- sha224_begin(cx);
- sha224_hash(data, len, cx);
- sha_end1(hval, cx, SHA224_DIGEST_SIZE);
+void SSE2Transform(sha512_ctx* ctx, void* mp, uint_64t num_blks)
+{
+ uint_64t i;
+ for (i = 0; i < num_blks; i++)
+ sha512_compress_nayuki(ctx->hash, (uint_8t*)mp + i * 128);
}
#endif
-#if defined(SHA_256)
+#endif // NO_OPTIMIZED_VERSIONS
-const uint_32t i256[8] =
+void sha512_begin(sha512_ctx* ctx)
{
- 0x6a09e667ul, 0xbb67ae85ul, 0x3c6ef372ul, 0xa54ff53aul,
- 0x510e527ful, 0x9b05688cul, 0x1f83d9abul, 0x5be0cd19ul
-};
+ ctx->hash[0] = 0x6a09e667f3bcc908;
+ ctx->hash[1] = 0xbb67ae8584caa73b;
+ ctx->hash[2] = 0x3c6ef372fe94f82b;
+ ctx->hash[3] = 0xa54ff53a5f1d36f1;
+ ctx->hash[4] = 0x510e527fade682d1;
+ ctx->hash[5] = 0x9b05688c2b3e6c1f;
+ ctx->hash[6] = 0x1f83d9abfb41bd6b;
+ ctx->hash[7] = 0x5be0cd19137e2179;
+ ctx->count[0] = 0;
+ ctx->count[1] = 0;
+
+ if (!transfunc)
+ {
+#ifndef NO_OPTIMIZED_VERSIONS
+#if CRYPTOPP_BOOL_X64
+ if (g_isIntel&& HasSAVX2() && HasSBMI2())
+ transfunc = Avx2Transform;
+ else if (g_isIntel && HasSAVX())
+ {
+ transfunc = AvxTransform;
+ }
+ else if (HasSSE41())
+ {
+ transfunc = SSE4Transform;
+ }
+ else
+#endif
-VOID_RETURN sha256_begin(sha256_ctx ctx[1])
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
+ if (HasSSE2() && HasMMX())
+ transfunc = SSE2Transform;
+ else
+#endif
+
+#endif
+ transfunc = StdTransform;
+ }
+}
+
+void sha512_end(unsigned char * result, sha512_ctx* ctx)
{
- ctx->count[0] = ctx->count[1] = 0;
- memcpy(ctx->hash, i256, 8 * sizeof(uint_32t));
+ int i;
+ uint_64t mlen, pos = ctx->count[0];
+ uint_8t* m = (uint_8t*) ctx->wbuf;
+ m[pos++] = 0x80;
+ if (pos > 112)
+ {
+ memset(m + pos, 0, (size_t) (128 - pos));
+ transfunc(ctx, m, 1);
+ pos = 0;
+ }
+ memset(m + pos, 0, (size_t) (128 - pos));
+ mlen = bswap_64(ctx->count[1]);
+ memcpy(m + (128 - 8), &mlen, 64 / 8);
+ transfunc(ctx, m, 1);
+ for (i = 0; i < 8; i++)
+ {
+ ctx->hash[i] = bswap_64(ctx->hash[i]);
+ }
+ memcpy(result, ctx->hash, 64);
}
-VOID_RETURN sha256_end(unsigned char hval[], sha256_ctx ctx[1])
+void sha512_hash(const unsigned char * data, uint_64t len, sha512_ctx *ctx)
{
- sha_end1(hval, ctx, SHA256_DIGEST_SIZE);
+ uint_64t pos = ctx->count[0];
+ uint_64t total = ctx->count[1];
+ uint_8t* m = (uint_8t*) ctx->wbuf;
+ if (pos && pos + len >= 128)
+ {
+ memcpy(m + pos, data, (size_t) (128 - pos));
+ transfunc(ctx, m, 1);
+ len -= 128 - pos;
+ total += (128 - pos) * 8;
+ data += 128 - pos;
+ pos = 0;
+ }
+ if (len >= 128)
+ {
+ uint_64t blocks = len / 128;
+ uint_64t bytes = blocks * 128;
+ transfunc(ctx, (void*)data, blocks);
+ len -= bytes;
+ total += (bytes)* 8;
+ data += bytes;
+ }
+ memcpy(m+pos, data, (size_t) (len));
+ pos += len;
+ total += len * 8;
+ ctx->count[0] = pos;
+ ctx->count[1] = total;
}
-VOID_RETURN sha256(unsigned char hval[], const unsigned char data[], unsigned long len)
-{ sha256_ctx cx[1];
+void sha512(unsigned char * result, const unsigned char* source, uint_64t sourceLen)
+{
+ sha512_ctx ctx;
- sha256_begin(cx);
- sha256_hash(data, len, cx);
- sha_end1(hval, cx, SHA256_DIGEST_SIZE);
+ sha512_begin(&ctx);
+ sha512_hash(source, sourceLen, &ctx);
+ sha512_end(result, &ctx);
}
-#endif
+/////////////////////////////
-#if defined(SHA_384) || defined(SHA_512)
+#ifndef NO_OPTIMIZED_VERSIONS
-#define SHA512_MASK (SHA512_BLOCK_SIZE - 1)
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
-#if defined(SWAP_BYTES)
-#define bsw_64(p,n) \
- { int _i = (n); while(_i--) ((uint_64t*)p)[_i] = bswap_64(((uint_64t*)p)[_i]); }
-#else
-#define bsw_64(p,n)
+#if CRYPTOPP_BOOL_X64
+ void sha256_sse4(void *input_data, uint_32t digest[8], uint_64t num_blks);
+ void sha256_rorx(void *input_data, uint_32t digest[8], uint_64t num_blks);
+ void sha256_avx(void *input_data, uint_32t digest[8], uint_64t num_blks);
#endif
-/* SHA512 mixing function definitions */
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+ void sha256_compress_nayuki(uint_32t state[8], const uint_8t block[64]);
+#endif
-#ifdef s_0
-# undef s_0
-# undef s_1
-# undef g_0
-# undef g_1
-# undef k_0
+#if defined(__cplusplus)
+}
#endif
-#define s_0(x) (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39))
-#define s_1(x) (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41))
-#define g_0(x) (rotr64((x), 1) ^ rotr64((x), 8) ^ ((x) >> 7))
-#define g_1(x) (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6))
-#define k_0 k512
+#endif
-/* SHA384/SHA512 mixing data */
+CRYPTOPP_ALIGN_DATA(16) uint_32t SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ };
+
+#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE))
+
+#ifdef _MSC_VER
+# pragma warning(disable: 4100 4731)
+#endif
-const uint_64t k512[80] =
+static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(uint_32t *state, const uint_32t *data, size_t len)
{
- li_64(428a2f98d728ae22), li_64(7137449123ef65cd),
- li_64(b5c0fbcfec4d3b2f), li_64(e9b5dba58189dbbc),
- li_64(3956c25bf348b538), li_64(59f111f1b605d019),
- li_64(923f82a4af194f9b), li_64(ab1c5ed5da6d8118),
- li_64(d807aa98a3030242), li_64(12835b0145706fbe),
- li_64(243185be4ee4b28c), li_64(550c7dc3d5ffb4e2),
- li_64(72be5d74f27b896f), li_64(80deb1fe3b1696b1),
- li_64(9bdc06a725c71235), li_64(c19bf174cf692694),
- li_64(e49b69c19ef14ad2), li_64(efbe4786384f25e3),
- li_64(0fc19dc68b8cd5b5), li_64(240ca1cc77ac9c65),
- li_64(2de92c6f592b0275), li_64(4a7484aa6ea6e483),
- li_64(5cb0a9dcbd41fbd4), li_64(76f988da831153b5),
- li_64(983e5152ee66dfab), li_64(a831c66d2db43210),
- li_64(b00327c898fb213f), li_64(bf597fc7beef0ee4),
- li_64(c6e00bf33da88fc2), li_64(d5a79147930aa725),
- li_64(06ca6351e003826f), li_64(142929670a0e6e70),
- li_64(27b70a8546d22ffc), li_64(2e1b21385c26c926),
- li_64(4d2c6dfc5ac42aed), li_64(53380d139d95b3df),
- li_64(650a73548baf63de), li_64(766a0abb3c77b2a8),
- li_64(81c2c92e47edaee6), li_64(92722c851482353b),
- li_64(a2bfe8a14cf10364), li_64(a81a664bbc423001),
- li_64(c24b8b70d0f89791), li_64(c76c51a30654be30),
- li_64(d192e819d6ef5218), li_64(d69906245565a910),
- li_64(f40e35855771202a), li_64(106aa07032bbd1b8),
- li_64(19a4c116b8d2d0c8), li_64(1e376c085141ab53),
- li_64(2748774cdf8eeb99), li_64(34b0bcb5e19b48a8),
- li_64(391c0cb3c5c95a63), li_64(4ed8aa4ae3418acb),
- li_64(5b9cca4f7763e373), li_64(682e6ff3d6b2b8a3),
- li_64(748f82ee5defb2fc), li_64(78a5636f43172f60),
- li_64(84c87814a1f0ab72), li_64(8cc702081a6439ec),
- li_64(90befffa23631e28), li_64(a4506cebde82bde9),
- li_64(bef9a3f7b2c67915), li_64(c67178f2e372532b),
- li_64(ca273eceea26619c), li_64(d186b8c721c0c207),
- li_64(eada7dd6cde0eb1e), li_64(f57d4f7fee6ed178),
- li_64(06f067aa72176fba), li_64(0a637dc5a2c898a6),
- li_64(113f9804bef90dae), li_64(1b710b35131c471b),
- li_64(28db77f523047d84), li_64(32caab7b40c72493),
- li_64(3c9ebe0a15c9bebc), li_64(431d67c49c100d4c),
- li_64(4cc5d4becb3e42b6), li_64(597f299cfc657e2a),
- li_64(5fcb6fab3ad6faec), li_64(6c44198c4a475817)
-};
+ #define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ
+ #define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4]
+ #define G(i) H(i+1)
+ #define F(i) H(i+2)
+ #define E(i) H(i+3)
+ #define D(i) H(i+4)
+ #define C(i) H(i+5)
+ #define B(i) H(i+6)
+ #define A(i) H(i+7)
+ #define Wt(i) BASE+8*4+ASM_MOD(1024+15-(i),16)*4
+ #define Wt_2(i) Wt((i)-2)
+ #define Wt_15(i) Wt((i)-15)
+ #define Wt_7(i) Wt((i)-7)
+ #define K_END [BASE+8*4+16*4+0*WORD_SZ]
+ #define STATE_SAVE [BASE+8*4+16*4+1*WORD_SZ]
+ #define DATA_SAVE [BASE+8*4+16*4+2*WORD_SZ]
+ #define DATA_END [BASE+8*4+16*4+3*WORD_SZ]
+ #define Kt(i) WORD_REG(si)+(i)*4
+#if CRYPTOPP_BOOL_X32
+ #define BASE esp+8
+#elif CRYPTOPP_BOOL_X86
+ #define BASE esp+4
+#elif defined(__GNUC__)
+ #define BASE r8
+#else
+ #define BASE rsp
+#endif
-/* Compile 128 bytes of hash data into SHA384/512 digest */
-/* NOTE: this routine assumes that the byte order in the */
-/* ctx->wbuf[] at this point is such that low address bytes */
-/* in the ORIGINAL byte stream will go into the high end of */
-/* words on BOTH big and little endian systems */
+#define RA0(i, edx, edi) \
+ AS2( add edx, [Kt(i)] )\
+ AS2( add edx, [Wt(i)] )\
+ AS2( add edx, H(i) )\
+
+#define RA1(i, edx, edi)
+
+#define RB0(i, edx, edi)
+
+#define RB1(i, edx, edi) \
+ AS2( mov AS_REG_7d, [Wt_2(i)] )\
+ AS2( mov edi, [Wt_15(i)])\
+ AS2( mov ebx, AS_REG_7d )\
+ AS2( shr AS_REG_7d, 10 )\
+ AS2( ror ebx, 17 )\
+ AS2( xor AS_REG_7d, ebx )\
+ AS2( ror ebx, 2 )\
+ AS2( xor ebx, AS_REG_7d )/* s1(W_t-2) */\
+ AS2( add ebx, [Wt_7(i)])\
+ AS2( mov AS_REG_7d, edi )\
+ AS2( shr AS_REG_7d, 3 )\
+ AS2( ror edi, 7 )\
+ AS2( add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\
+ AS2( xor AS_REG_7d, edi )\
+ AS2( add edx, [Kt(i)])\
+ AS2( ror edi, 11 )\
+ AS2( add edx, H(i) )\
+ AS2( xor AS_REG_7d, edi )/* s0(W_t-15) */\
+ AS2( add AS_REG_7d, ebx )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) W_t-16*/\
+ AS2( mov [Wt(i)], AS_REG_7d)\
+ AS2( add edx, AS_REG_7d )\
+
+#define ROUND(i, r, eax, ecx, edi, edx)\
+ /* in: edi = E */\
+ /* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\
+ AS2( mov edx, F(i) )\
+ AS2( xor edx, G(i) )\
+ AS2( and edx, edi )\
+ AS2( xor edx, G(i) )/* Ch(E,F,G) = (G^(E&(F^G))) */\
+ AS2( mov AS_REG_7d, edi )\
+ AS2( ror edi, 6 )\
+ AS2( ror AS_REG_7d, 25 )\
+ RA##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
+ AS2( xor AS_REG_7d, edi )\
+ AS2( ror edi, 5 )\
+ AS2( xor AS_REG_7d, edi )/* S1(E) */\
+ AS2( add edx, AS_REG_7d )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\
+ RB##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\
+ /* in: ecx = A, eax = B^C, edx = T1 */\
+ /* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\
+ AS2( mov ebx, ecx )\
+ AS2( xor ecx, B(i) )/* A^B */\
+ AS2( and eax, ecx )\
+ AS2( xor eax, B(i) )/* Maj(A,B,C) = B^((A^B)&(B^C) */\
+ AS2( mov AS_REG_7d, ebx )\
+ AS2( ror ebx, 2 )\
+ AS2( add eax, edx )/* T1 + Maj(A,B,C) */\
+ AS2( add edx, D(i) )\
+ AS2( mov D(i), edx )\
+ AS2( ror AS_REG_7d, 22 )\
+ AS2( xor AS_REG_7d, ebx )\
+ AS2( ror ebx, 11 )\
+ AS2( xor AS_REG_7d, ebx )\
+ AS2( add eax, AS_REG_7d )/* T1 + S0(A) + Maj(A,B,C) */\
+ AS2( mov H(i), eax )\
+
+// Unroll the use of CRYPTOPP_BOOL_X64 in assembler math. The GAS assembler on X32 (version 2.25)
+// complains "Error: invalid operands (*ABS* and *UND* sections) for `*` and `-`"
+#if CRYPTOPP_BOOL_X64
+#define SWAP_COPY(i) \
+ AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
+ AS1( bswap WORD_REG(bx))\
+ AS2( mov [Wt(i*2+1)], WORD_REG(bx))
+#else // X86 and X32
+#define SWAP_COPY(i) \
+ AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\
+ AS1( bswap WORD_REG(bx))\
+ AS2( mov [Wt(i)], WORD_REG(bx))
+#endif
-VOID_RETURN sha512_compile(sha512_ctx ctx[1])
-{ uint_64t v[8], *p = ctx->wbuf;
- uint_32t j;
-#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
- uint_32t i;
+#if defined(__GNUC__)
+ #if CRYPTOPP_BOOL_X64
+ CRYPTOPP_ALIGN_DATA(16) byte workspace[LOCALS_SIZE] ;
+ #endif
+ __asm__ __volatile__
+ (
+ #if CRYPTOPP_BOOL_X64
+ "lea %4, %%r8;"
+ #endif
+ INTEL_NOPREFIX
#endif
- memcpy(v, ctx->hash, 8 * sizeof(uint_64t));
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+ #ifndef __GNUC__
+ AS2( mov edi, [len])
+ AS2( lea WORD_REG(si), [SHA256_K+48*4])
+ #endif
+ #if !defined(_MSC_VER) || (_MSC_VER < 1400)
+ AS_PUSH_IF86(bx)
+ #endif
+
+ AS_PUSH_IF86(bp)
+ AS2( mov ebx, esp)
+ AS2( and esp, -16)
+ AS2( sub WORD_REG(sp), LOCALS_SIZE)
+ AS_PUSH_IF86(bx)
+#endif
+ AS2( mov STATE_SAVE, WORD_REG(cx))
+ AS2( mov DATA_SAVE, WORD_REG(dx))
+ AS2( lea WORD_REG(ax), [WORD_REG(di) + WORD_REG(dx)])
+ AS2( mov DATA_END, WORD_REG(ax))
+ AS2( mov K_END, WORD_REG(si))
+
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+ AS2( test edi, 1)
+ ASJ( jnz, 2, f)
+ AS1( dec DWORD PTR K_END)
+#endif
+ AS2( movdqa xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16])
+ AS2( movdqa xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16])
+#endif
- for(j = 0; j < 80; j += 16)
- {
-#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
- for (i = 0; i < 16; i++)
- {
- v_cycle( i, j);
- }
-#else
- v_cycle( 0, j); v_cycle( 1, j);
- v_cycle( 2, j); v_cycle( 3, j);
- v_cycle( 4, j); v_cycle( 5, j);
- v_cycle( 6, j); v_cycle( 7, j);
- v_cycle( 8, j); v_cycle( 9, j);
- v_cycle(10, j); v_cycle(11, j);
- v_cycle(12, j); v_cycle(13, j);
- v_cycle(14, j); v_cycle(15, j);
-#endif
- }
-
- ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
- ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
- ctx->hash[4] += v[4]; ctx->hash[5] += v[5];
- ctx->hash[6] += v[6]; ctx->hash[7] += v[7];
-}
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ ASJ( jmp, 0, f)
+#endif
+ ASL(2) // non-SSE2
+ AS2( mov esi, ecx)
+ AS2( lea edi, A(0))
+ AS2( mov ecx, 8)
+ATT_NOPREFIX
+ AS1( rep movsd)
+INTEL_NOPREFIX
+ AS2( mov esi, K_END)
+ ASJ( jmp, 3, f)
+#endif
-/* Compile 128 bytes of hash data into SHA256 digest value */
-/* NOTE: this routine assumes that the byte order in the */
-/* ctx->wbuf[] at this point is in such an order that low */
-/* address bytes in the ORIGINAL byte stream placed in this */
-/* buffer will now go to the high end of words on BOTH big */
-/* and little endian systems */
-
-VOID_RETURN sha512_hash(const unsigned char data[], unsigned long len, sha512_ctx ctx[1])
-{ uint_32t pos = (uint_32t)(ctx->count[0] & SHA512_MASK),
- space = SHA512_BLOCK_SIZE - pos;
- const unsigned char *sp = data;
-
- if((ctx->count[0] += len) < len)
- ++(ctx->count[1]);
-
- while(len >= space) /* tranfer whole blocks while possible */
- {
- memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space);
- sp += space; len -= space; space = SHA512_BLOCK_SIZE; pos = 0;
- bsw_64(ctx->wbuf, SHA512_BLOCK_SIZE >> 3);
- sha512_compile(ctx);
- }
-
- memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len);
-}
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ ASL(0)
+ AS2( movdqa E(0), xmm1)
+ AS2( movdqa A(0), xmm0)
+#endif
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+ ASL(3)
+#endif
+ AS2( sub WORD_REG(si), 48*4)
+ SWAP_COPY(0) SWAP_COPY(1) SWAP_COPY(2) SWAP_COPY(3)
+ SWAP_COPY(4) SWAP_COPY(5) SWAP_COPY(6) SWAP_COPY(7)
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+ SWAP_COPY(8) SWAP_COPY(9) SWAP_COPY(10) SWAP_COPY(11)
+ SWAP_COPY(12) SWAP_COPY(13) SWAP_COPY(14) SWAP_COPY(15)
+#endif
+ AS2( mov edi, E(0)) // E
+ AS2( mov eax, B(0)) // B
+ AS2( xor eax, C(0)) // B^C
+ AS2( mov ecx, A(0)) // A
+
+ ROUND(0, 0, eax, ecx, edi, edx)
+ ROUND(1, 0, ecx, eax, edx, edi)
+ ROUND(2, 0, eax, ecx, edi, edx)
+ ROUND(3, 0, ecx, eax, edx, edi)
+ ROUND(4, 0, eax, ecx, edi, edx)
+ ROUND(5, 0, ecx, eax, edx, edi)
+ ROUND(6, 0, eax, ecx, edi, edx)
+ ROUND(7, 0, ecx, eax, edx, edi)
+ ROUND(8, 0, eax, ecx, edi, edx)
+ ROUND(9, 0, ecx, eax, edx, edi)
+ ROUND(10, 0, eax, ecx, edi, edx)
+ ROUND(11, 0, ecx, eax, edx, edi)
+ ROUND(12, 0, eax, ecx, edi, edx)
+ ROUND(13, 0, ecx, eax, edx, edi)
+ ROUND(14, 0, eax, ecx, edi, edx)
+ ROUND(15, 0, ecx, eax, edx, edi)
+
+ ASL(1)
+ AS2(add WORD_REG(si), 4*16)
+ ROUND(0, 1, eax, ecx, edi, edx)
+ ROUND(1, 1, ecx, eax, edx, edi)
+ ROUND(2, 1, eax, ecx, edi, edx)
+ ROUND(3, 1, ecx, eax, edx, edi)
+ ROUND(4, 1, eax, ecx, edi, edx)
+ ROUND(5, 1, ecx, eax, edx, edi)
+ ROUND(6, 1, eax, ecx, edi, edx)
+ ROUND(7, 1, ecx, eax, edx, edi)
+ ROUND(8, 1, eax, ecx, edi, edx)
+ ROUND(9, 1, ecx, eax, edx, edi)
+ ROUND(10, 1, eax, ecx, edi, edx)
+ ROUND(11, 1, ecx, eax, edx, edi)
+ ROUND(12, 1, eax, ecx, edi, edx)
+ ROUND(13, 1, ecx, eax, edx, edi)
+ ROUND(14, 1, eax, ecx, edi, edx)
+ ROUND(15, 1, ecx, eax, edx, edi)
+ AS2( cmp WORD_REG(si), K_END)
+ ATT_NOPREFIX
+ ASJ( jb, 1, b)
+ INTEL_NOPREFIX
+
+ AS2( mov WORD_REG(dx), DATA_SAVE)
+ AS2( add WORD_REG(dx), 64)
+ AS2( mov AS_REG_7, STATE_SAVE)
+ AS2( mov DATA_SAVE, WORD_REG(dx))
+
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+ AS2( test DWORD PTR K_END, 1)
+ ASJ( jz, 4, f)
+#endif
+ AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_7+1*16])
+ AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_7+0*16])
+ AS2( paddd xmm1, E(0))
+ AS2( paddd xmm0, A(0))
+ AS2( movdqa [AS_REG_7+1*16], xmm1)
+ AS2( movdqa [AS_REG_7+0*16], xmm0)
+ AS2( cmp WORD_REG(dx), DATA_END)
+ ATT_NOPREFIX
+ ASJ( jb, 0, b)
+ INTEL_NOPREFIX
+#endif
-/* SHA384/512 Final padding and digest calculation */
-
-static void sha_end2(unsigned char hval[], sha512_ctx ctx[1], const unsigned int hlen)
-{ uint_32t i = (uint_32t)(ctx->count[0] & SHA512_MASK);
-
- /* put bytes in the buffer in an order in which references to */
- /* 32-bit words will put bytes with lower addresses into the */
- /* top of 32 bit words on BOTH big and little endian machines */
- bsw_64(ctx->wbuf, (i + 7) >> 3);
-
- /* we now need to mask valid bytes and add the padding which is */
- /* a single 1 bit and as many zero bits as necessary. Note that */
- /* we can always add the first padding byte here because the */
- /* buffer always has at least one empty slot */
- ctx->wbuf[i >> 3] &= li_64(ffffffffffffff00) << 8 * (~i & 7);
- ctx->wbuf[i >> 3] |= li_64(0000000000000080) << 8 * (~i & 7);
-
- /* we need 17 or more empty byte positions, one for the padding */
- /* byte (above) and sixteen for the length count. If there is */
- /* not enough space pad and empty the buffer */
- if(i > SHA512_BLOCK_SIZE - 17)
- {
- if(i < 120) ctx->wbuf[15] = 0;
- sha512_compile(ctx);
- i = 0;
- }
- else
- i = (i >> 3) + 1;
-
- while(i < 14)
- ctx->wbuf[i++] = 0;
-
- /* the following 64-bit length fields are assembled in the */
- /* wrong byte order on little endian machines but this is */
- /* corrected later since they are only ever used as 64-bit */
- /* word values. */
- ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 61);
- ctx->wbuf[15] = ctx->count[0] << 3;
- sha512_compile(ctx);
-
- /* extract the hash value as bytes in case the hash buffer is */
- /* misaligned for 32-bit words */
- for(i = 0; i < hlen; ++i)
- hval[i] = (unsigned char)(ctx->hash[i >> 3] >> (8 * (~i & 7)));
-}
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ ASJ( jmp, 5, f)
+ ASL(4) // non-SSE2
+#endif
+ AS2( add [AS_REG_7+0*4], ecx) // A
+ AS2( add [AS_REG_7+4*4], edi) // E
+ AS2( mov eax, B(0))
+ AS2( mov ebx, C(0))
+ AS2( mov ecx, D(0))
+ AS2( add [AS_REG_7+1*4], eax)
+ AS2( add [AS_REG_7+2*4], ebx)
+ AS2( add [AS_REG_7+3*4], ecx)
+ AS2( mov eax, F(0))
+ AS2( mov ebx, G(0))
+ AS2( mov ecx, H(0))
+ AS2( add [AS_REG_7+5*4], eax)
+ AS2( add [AS_REG_7+6*4], ebx)
+ AS2( add [AS_REG_7+7*4], ecx)
+ AS2( mov ecx, AS_REG_7d)
+ AS2( cmp WORD_REG(dx), DATA_END)
+ ASJ( jb, 2, b)
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ ASL(5)
+#endif
+#endif
+ AS_POP_IF86(sp)
+ AS_POP_IF86(bp)
+ #if !defined(_MSC_VER) || (_MSC_VER < 1400)
+ AS_POP_IF86(bx)
+ #endif
+
+#ifdef __GNUC__
+ ATT_PREFIX
+ :
+ : "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len)
+ #if CRYPTOPP_BOOL_X64
+ , "m" (workspace[0])
+ #endif
+ : "memory", "cc", "%eax"
+ #if CRYPTOPP_BOOL_X64
+ , "%rbx", "%r8", "%r10"
+ #endif
+ );
#endif
+}
-#if defined(SHA_384)
+#endif // (defined(CRYPTOPP_X86_ASM_AVAILABLE))
-/* SHA384 initialisation data */
+#undef sum0
+#undef sum1
+#undef sigma0
+#undef sigma1
-const uint_64t i384[80] =
-{
- li_64(cbbb9d5dc1059ed8), li_64(629a292a367cd507),
- li_64(9159015a3070dd17), li_64(152fecd8f70e5939),
- li_64(67332667ffc00b31), li_64(8eb44a8768581511),
- li_64(db0c2e0d64f98fa7), li_64(47b5481dbefa4fa4)
-};
+#define sum0(x) (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22))
+#define sum1(x) (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25))
+#define sigma0(x) (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3))
+#define sigma1(x) (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))
-VOID_RETURN sha384_begin(sha384_ctx ctx[1])
-{
- ctx->count[0] = ctx->count[1] = 0;
- memcpy(ctx->hash, i384, 8 * sizeof(uint_64t));
-}
-VOID_RETURN sha384_end(unsigned char hval[], sha384_ctx ctx[1])
-{
- sha_end2(hval, ctx, SHA384_DIGEST_SIZE);
-}
+typedef void (*sha256transformFn)(sha256_ctx* ctx, void* m, uint_64t num_blks);
-VOID_RETURN sha384(unsigned char hval[], const unsigned char data[], unsigned long len)
-{ sha384_ctx cx[1];
-
- sha384_begin(cx);
- sha384_hash(data, len, cx);
- sha_end2(hval, cx, SHA384_DIGEST_SIZE);
-}
+sha256transformFn sha256transfunc = NULL;
+void StdSha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
+{
+ uint_64t blk;
+ for (blk = 0; blk < num_blks; blk++)
+ {
+ uint_32t W[16];
+ uint_32t a,b,c,d,e,f,g,h;
+ uint_32t T1, T2;
+ int i;
+#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
+ int j;
#endif
-#if defined(SHA_512)
-
-/* SHA512 initialisation data */
+ for (i = 0; i < 64 / 4; i++)
+ {
+ W[i] = bswap_32((((const uint_32t*)(mp))[blk * 16 + i]));
+ }
+
+ a = ctx->hash[0];
+ b = ctx->hash[1];
+ c = ctx->hash[2];
+ d = ctx->hash[3];
+ e = ctx->hash[4];
+ f = ctx->hash[5];
+ g = ctx->hash[6];
+ h = ctx->hash[7];
+
+ for (i = 0; i <= 63; i+=16)
+ {
+#if defined (TC_WINDOWS_DRIVER) && defined (DEBUG)
+ for (j = 0; j < 16; j++)
+ {
+ COMPRESS_ROUND(i, j, SHA256_K);
+ }
+#else
+ COMPRESS_ROUND(i, 0, SHA256_K);
+ COMPRESS_ROUND(i, 1, SHA256_K);
+ COMPRESS_ROUND(i , 2, SHA256_K);
+ COMPRESS_ROUND(i, 3, SHA256_K);
+ COMPRESS_ROUND(i, 4, SHA256_K);
+ COMPRESS_ROUND(i, 5, SHA256_K);
+ COMPRESS_ROUND(i, 6, SHA256_K);
+ COMPRESS_ROUND(i, 7, SHA256_K);
+ COMPRESS_ROUND(i, 8, SHA256_K);
+ COMPRESS_ROUND(i, 9, SHA256_K);
+ COMPRESS_ROUND(i, 10, SHA256_K);
+ COMPRESS_ROUND(i, 11, SHA256_K);
+ COMPRESS_ROUND(i, 12, SHA256_K);
+ COMPRESS_ROUND(i, 13, SHA256_K);
+ COMPRESS_ROUND(i, 14, SHA256_K);
+ COMPRESS_ROUND(i, 15, SHA256_K);
+#endif
+ }
+ ctx->hash[0] += a;
+ ctx->hash[1] += b;
+ ctx->hash[2] += c;
+ ctx->hash[3] += d;
+ ctx->hash[4] += e;
+ ctx->hash[5] += f;
+ ctx->hash[6] += g;
+ ctx->hash[7] += h;
+ }
+}
-const uint_64t i512[80] =
-{
- li_64(6a09e667f3bcc908), li_64(bb67ae8584caa73b),
- li_64(3c6ef372fe94f82b), li_64(a54ff53a5f1d36f1),
- li_64(510e527fade682d1), li_64(9b05688c2b3e6c1f),
- li_64(1f83d9abfb41bd6b), li_64(5be0cd19137e2179)
-};
+#ifndef NO_OPTIMIZED_VERSIONS
-VOID_RETURN sha512_begin(sha512_ctx ctx[1])
+#if CRYPTOPP_BOOL_X64
+void Avx2Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
{
- ctx->count[0] = ctx->count[1] = 0;
- memcpy(ctx->hash, i512, 8 * sizeof(uint_64t));
+ if (num_blks > 1)
+ sha256_rorx(mp, ctx->hash, num_blks);
+ else
+ sha256_sse4(mp, ctx->hash, num_blks);
}
-VOID_RETURN sha512_end(unsigned char hval[], sha512_ctx ctx[1])
+void AvxSha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
{
- sha_end2(hval, ctx, SHA512_DIGEST_SIZE);
+ if (num_blks > 1)
+ sha256_avx(mp, ctx->hash, num_blks);
+ else
+ sha256_sse4(mp, ctx->hash, num_blks);
}
-VOID_RETURN sha512(unsigned char hval[], const unsigned char data[], unsigned long len)
-{ sha512_ctx cx[1];
-
- sha512_begin(cx);
- sha512_hash(data, len, cx);
- sha_end2(hval, cx, SHA512_DIGEST_SIZE);
+void SSE4Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
+{
+ sha256_sse4(mp, ctx->hash, num_blks);
}
#endif
-#if defined(SHA_2)
-
-#define CTX_224(x) ((x)->uu->ctx256)
-#define CTX_256(x) ((x)->uu->ctx256)
-#define CTX_384(x) ((x)->uu->ctx512)
-#define CTX_512(x) ((x)->uu->ctx512)
-
-/* SHA2 initialisation */
-
-INT_RETURN sha2_begin(unsigned long len, sha2_ctx ctx[1])
+#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE))
+void SSE2Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
{
- switch(len)
- {
-#if defined(SHA_224)
- case 224:
- case 28: CTX_256(ctx)->count[0] = CTX_256(ctx)->count[1] = 0;
- memcpy(CTX_256(ctx)->hash, i224, 32);
- ctx->sha2_len = 28; return EXIT_SUCCESS;
-#endif
-#if defined(SHA_256)
- case 256:
- case 32: CTX_256(ctx)->count[0] = CTX_256(ctx)->count[1] = 0;
- memcpy(CTX_256(ctx)->hash, i256, 32);
- ctx->sha2_len = 32; return EXIT_SUCCESS;
-#endif
-#if defined(SHA_384)
- case 384:
- case 48: CTX_384(ctx)->count[0] = CTX_384(ctx)->count[1] = 0;
- memcpy(CTX_384(ctx)->hash, i384, 64);
- ctx->sha2_len = 48; return EXIT_SUCCESS;
-#endif
-#if defined(SHA_512)
- case 512:
- case 64: CTX_512(ctx)->count[0] = CTX_512(ctx)->count[1] = 0;
- memcpy(CTX_512(ctx)->hash, i512, 64);
- ctx->sha2_len = 64; return EXIT_SUCCESS;
-#endif
- default: return EXIT_FAILURE;
- }
+ X86_SHA256_HashBlocks(ctx->hash, (const uint_32t*)mp, (size_t)(num_blks * 64));
}
+#endif
-VOID_RETURN sha2_hash(const unsigned char data[], unsigned long len, sha2_ctx ctx[1])
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+void Sha256AsmTransform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
{
- switch(ctx->sha2_len)
- {
-#if defined(SHA_224)
- case 28: sha224_hash(data, len, CTX_224(ctx)); return;
-#endif
-#if defined(SHA_256)
- case 32: sha256_hash(data, len, CTX_256(ctx)); return;
-#endif
-#if defined(SHA_384)
- case 48: sha384_hash(data, len, CTX_384(ctx)); return;
+ uint_64t i;
+ for (i = 0; i < num_blks; i++)
+ sha256_compress_nayuki(ctx->hash, (uint_8t*)mp + i * 64);
+}
#endif
-#if defined(SHA_512)
- case 64: sha512_hash(data, len, CTX_512(ctx)); return;
+
#endif
- }
-}
-VOID_RETURN sha2_end(unsigned char hval[], sha2_ctx ctx[1])
+void sha256_begin(sha256_ctx* ctx)
{
- switch(ctx->sha2_len)
- {
-#if defined(SHA_224)
- case 28: sha_end1(hval, CTX_224(ctx), SHA224_DIGEST_SIZE); return;
+ ctx->hash[0] = 0x6a09e667;
+ ctx->hash[1] = 0xbb67ae85;
+ ctx->hash[2] = 0x3c6ef372;
+ ctx->hash[3] = 0xa54ff53a;
+ ctx->hash[4] = 0x510e527f;
+ ctx->hash[5] = 0x9b05688c;
+ ctx->hash[6] = 0x1f83d9ab;
+ ctx->hash[7] = 0x5be0cd19;
+ ctx->count[0] = 0;
+ ctx->count[1] = 0;
+
+ if (!sha256transfunc)
+ {
+#ifndef NO_OPTIMIZED_VERSIONS
+#ifdef _M_X64
+ if (g_isIntel && HasSAVX2() && HasSBMI2())
+ sha256transfunc = Avx2Sha256Transform;
+ else if (g_isIntel && HasSAVX())
+ sha256transfunc = AvxSha256Transform;
+ else if (HasSSE41())
+ sha256transfunc = SSE4Sha256Transform;
+ else
#endif
-#if defined(SHA_256)
- case 32: sha_end1(hval, CTX_256(ctx), SHA256_DIGEST_SIZE); return;
+
+#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE))
+ if (HasSSE2 ())
+ sha256transfunc = SSE2Sha256Transform;
+ else
#endif
-#if defined(SHA_384)
- case 48: sha_end2(hval, CTX_384(ctx), SHA384_DIGEST_SIZE); return;
+
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
+ sha256transfunc = Sha256AsmTransform;
+#else
+ sha256transfunc = StdSha256Transform;
#endif
-#if defined(SHA_512)
- case 64: sha_end2(hval, CTX_512(ctx), SHA512_DIGEST_SIZE); return;
+#else
+ sha256transfunc = StdSha256Transform;
#endif
- }
+ }
}
-INT_RETURN sha2(unsigned char hval[], unsigned long size,
- const unsigned char data[], unsigned long len)
-{ sha2_ctx cx[1];
+void sha256_end(unsigned char * result, sha256_ctx* ctx)
+{
+ int i;
+ uint_64t mlen, pos = ctx->count[0];
+ uint_8t* m = (uint_8t*) ctx->wbuf;
+ m[pos++] = 0x80;
+ if (pos > 56)
+ {
+ memset(m + pos, 0, (size_t) (64 - pos));
+ sha256transfunc(ctx, m, 1);
+ pos = 0;
+ }
+ memset(m + pos, 0, (size_t) (56 - pos));
+ mlen = bswap_64((uint_64t) ctx->count[1]);
+ memcpy(m + (64 - 8), &mlen, 64 / 8);
+ sha256transfunc(ctx, m, 1);
+ for (i = 0; i < 8; i++)
+ {
+ ctx->hash[i] = bswap_32(ctx->hash[i]);
+ }
+ memcpy(result, ctx->hash, 32);
+}
- if(sha2_begin(size, cx) == EXIT_SUCCESS)
- {
- sha2_hash(data, len, cx); sha2_end(hval, cx); return EXIT_SUCCESS;
- }
- else
- return EXIT_FAILURE;
+void sha256_hash(const unsigned char * data, uint_32t len, sha256_ctx *ctx)
+{
+ uint_32t pos = ctx->count[0];
+ uint_32t total = ctx->count[1];
+ uint_8t* m = (uint_8t*) ctx->wbuf;
+ if (pos && pos + len >= 64)
+ {
+ memcpy(m + pos, data, 64 - pos);
+ sha256transfunc(ctx, m, 1);
+ len -= 64 - pos;
+ total += (64 - pos) * 8;
+ data += 64 - pos;
+ pos = 0;
+ }
+ if (len >= 64)
+ {
+ uint_32t blocks = len / 64;
+ uint_32t bytes = blocks * 64;
+ sha256transfunc(ctx, (void*)data, blocks);
+ len -= bytes;
+ total += (bytes)* 8;
+ data += bytes;
+ }
+ memcpy(m+pos, data, len);
+ pos += len;
+ total += len * 8;
+ ctx->count[0] = pos;
+ ctx->count[1] = total;
}
-#endif
+void sha256(unsigned char * result, const unsigned char* source, uint_32t sourceLen)
+{
+ sha256_ctx ctx;
-#if defined(__cplusplus)
+ sha256_begin(&ctx);
+ sha256_hash(source, sourceLen, &ctx);
+ sha256_end(result, &ctx);
}
-#endif
diff --git a/src/Crypto/Sha2.h b/src/Crypto/Sha2.h
index 6d0aeb0f..37625ce8 100644
--- a/src/Crypto/Sha2.h
+++ b/src/Crypto/Sha2.h
@@ -1,155 +1,60 @@
/*
- ---------------------------------------------------------------------------
- Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
- 1. source code distributions include the above copyright notice, this
- list of conditions and the following disclaimer;
-
- 2. binary distributions include the above copyright notice, this list
- of conditions and the following disclaimer in their documentation;
-
- 3. the name of the copyright holder is not used to endorse products
- built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 01/08/2005
-*/
+ * Copyright (c) 2013-2017 IDRIX
+ * Governed by the Apache License 2.0 the full text of which is contained
+ * in the file License.txt included in VeraCrypt binary and source
+ * code distribution packages.
+ */
#ifndef _SHA2_H
#define _SHA2_H
#include "Common/Tcdefs.h"
#include "Common/Endian.h"
-
-#define SHA_64BIT
-
-/* define the hash functions that you need */
-#define SHA_2 /* for dynamic hash length */
-#define SHA_224
-#define SHA_256
-#ifdef SHA_64BIT
-# define SHA_384
-# define SHA_512
-# define NEED_UINT_64T
-#endif
-
-#ifndef EXIT_SUCCESS
-#define EXIT_SUCCESS 0
-#define EXIT_FAILURE 1
-#endif
-
-#define li_64(h) 0x##h##ull
-
-#define VOID_RETURN void
-#define INT_RETURN int
+#include "Crypto/config.h"
#if defined(__cplusplus)
-extern "C"
-{
+extern "C" {
#endif
-/* Note that the following function prototypes are the same */
-/* for both the bit and byte oriented implementations. But */
-/* the length fields are in bytes or bits as is appropriate */
-/* for the version used. Bit sequences are arrays of bytes */
-/* in which bit sequence indexes increase from the most to */
-/* the least significant end of each byte */
-
-#define SHA224_DIGEST_SIZE 28
-#define SHA224_BLOCK_SIZE 64
#define SHA256_DIGEST_SIZE 32
#define SHA256_BLOCK_SIZE 64
-/* type to hold the SHA256 (and SHA224) context */
-
-typedef struct
-{ uint_32t count[2];
- uint_32t hash[8];
- uint_32t wbuf[16];
-} sha256_ctx;
-
-typedef sha256_ctx sha224_ctx;
-
-VOID_RETURN sha256_compile(sha256_ctx ctx[1]);
-
-VOID_RETURN sha224_begin(sha224_ctx ctx[1]);
-#define sha224_hash sha256_hash
-VOID_RETURN sha224_end(unsigned char hval[], sha224_ctx ctx[1]);
-VOID_RETURN sha224(unsigned char hval[], const unsigned char data[], unsigned long len);
-
-VOID_RETURN sha256_begin(sha256_ctx ctx[1]);
-VOID_RETURN sha256_hash(const unsigned char data[], unsigned long len, sha256_ctx ctx[1]);
-VOID_RETURN sha256_end(unsigned char hval[], sha256_ctx ctx[1]);
-VOID_RETURN sha256(unsigned char hval[], const unsigned char data[], unsigned long len);
-
-#ifndef SHA_64BIT
-
-typedef struct
-{ union
- { sha256_ctx ctx256[1];
- } uu[1];
- uint_32t sha2_len;
-} sha2_ctx;
-
-#define SHA2_MAX_DIGEST_SIZE SHA256_DIGEST_SIZE
-
-#else
-
-#define SHA384_DIGEST_SIZE 48
-#define SHA384_BLOCK_SIZE 128
#define SHA512_DIGEST_SIZE 64
#define SHA512_BLOCK_SIZE 128
-#define SHA2_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE
-/* type to hold the SHA384 (and SHA512) context */
+#if CRYPTOPP_BOOL_X64
+#define SHA2_ALIGN CRYPTOPP_ALIGN_DATA(32)
+#else
+#define SHA2_ALIGN CRYPTOPP_ALIGN_DATA(16)
+#endif
typedef struct
{ uint_64t count[2];
- uint_64t hash[8];
- uint_64t wbuf[16];
+ SHA2_ALIGN uint_64t hash[8];
+ SHA2_ALIGN uint_64t wbuf[16];
} sha512_ctx;
-typedef sha512_ctx sha384_ctx;
-
typedef struct
-{ union
- { sha256_ctx ctx256[1];
- sha512_ctx ctx512[1];
- } uu[1];
- uint_32t sha2_len;
-} sha2_ctx;
-
-VOID_RETURN sha512_compile(sha512_ctx ctx[1]);
-
-VOID_RETURN sha384_begin(sha384_ctx ctx[1]);
-#define sha384_hash sha512_hash
-VOID_RETURN sha384_end(unsigned char hval[], sha384_ctx ctx[1]);
-VOID_RETURN sha384(unsigned char hval[], const unsigned char data[], unsigned long len);
+{ uint_32t count[2];
+ SHA2_ALIGN uint_32t hash[8];
+ SHA2_ALIGN uint_32t wbuf[16];
+} sha256_ctx;
-VOID_RETURN sha512_begin(sha512_ctx ctx[1]);
-VOID_RETURN sha512_hash(const unsigned char data[], unsigned long len, sha512_ctx ctx[1]);
-VOID_RETURN sha512_end(unsigned char hval[], sha512_ctx ctx[1]);
-VOID_RETURN sha512(unsigned char hval[], const unsigned char data[], unsigned long len);
-INT_RETURN sha2_begin(unsigned long size, sha2_ctx ctx[1]);
-VOID_RETURN sha2_hash(const unsigned char data[], unsigned long len, sha2_ctx ctx[1]);
-VOID_RETURN sha2_end(unsigned char hval[], sha2_ctx ctx[1]);
-INT_RETURN sha2(unsigned char hval[], unsigned long size, const unsigned char data[], unsigned long len);
+void sha512_begin(sha512_ctx* ctx);
+void sha512_hash(const unsigned char * source, uint_64t sourceLen, sha512_ctx *ctx);
+void sha512_end(unsigned char * result, sha512_ctx* ctx);
+void sha512(unsigned char * result, const unsigned char* source, uint_64t sourceLen);
-#endif
+void sha256_begin(sha256_ctx* ctx);
+void sha256_hash(const unsigned char * source, uint_32t sourceLen, sha256_ctx *ctx);
+void sha256_end(unsigned char * result, sha256_ctx* ctx);
+void sha256(unsigned char * result, const unsigned char* source, uint_32t sourceLen);
#if defined(__cplusplus)
}
#endif
+
+
#endif
diff --git a/src/Crypto/Sources b/src/Crypto/Sources
index 60412bf1..a93f9530 100644
--- a/src/Crypto/Sources
+++ b/src/Crypto/Sources
@@ -9,7 +9,15 @@ NTTARGETFILES = \
"$(OBJ_PATH)\$(O)\gost89_$(TC_ARCH).obj" \
"$(OBJ_PATH)\$(O)\Twofish_$(TC_ARCH).obj" \
"$(OBJ_PATH)\$(O)\Camellia_$(TC_ARCH).obj" \
- "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).obj"
+ "$(OBJ_PATH)\$(O)\Camellia_aesni_$(TC_ARCH).obj" \
+ "$(OBJ_PATH)\$(O)\sha256-$(TC_ARCH)-nayuki.obj" \
+ "$(OBJ_PATH)\$(O)\sha512-$(TC_ARCH)-nayuki.obj" \
+ "$(OBJ_PATH)\$(O)\sha512_avx1_$(TC_ARCH).obj" \
+ "$(OBJ_PATH)\$(O)\sha512_avx2_$(TC_ARCH).obj" \
+ "$(OBJ_PATH)\$(O)\sha512_sse4_$(TC_ARCH).obj" \
+ "$(OBJ_PATH)\$(O)\sha256_avx1_$(TC_ARCH).obj" \
+ "$(OBJ_PATH)\$(O)\sha256_avx2_$(TC_ARCH).obj" \
+ "$(OBJ_PATH)\$(O)\sha256_sse4_$(TC_ARCH).obj"
SOURCES = \
Aes_$(TC_ARCH).asm \
@@ -30,5 +38,12 @@ SOURCES = \
Whirlpool.c \
Camellia.c \
Camellia_$(TC_ARCH).S \
- Camellia_aesni_$(TC_ARCH).S
-
+ Camellia_aesni_$(TC_ARCH).S \
+ sha256-$(TC_ARCH)-nayuki.S \
+ sha512-$(TC_ARCH)-nayuki.S \
+ sha512_avx1_$(TC_ARCH).asm \
+ sha512_avx2_$(TC_ARCH).asm \
+ sha512_sse4_$(TC_ARCH).asm \
+ sha256_avx1_$(TC_ARCH).asm \
+ sha256_avx2_$(TC_ARCH).asm \
+ sha256_sse4_$(TC_ARCH).asm
diff --git a/src/Crypto/sha256-x64-nayuki.S b/src/Crypto/sha256-x64-nayuki.S
new file mode 100644
index 00000000..c6dd16d1
--- /dev/null
+++ b/src/Crypto/sha256-x64-nayuki.S
@@ -0,0 +1,6 @@
+
+ .ifndef WINABI
+#if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+ .endif \ No newline at end of file
diff --git a/src/Crypto/sha256-x86-nayuki.S b/src/Crypto/sha256-x86-nayuki.S
new file mode 100644
index 00000000..a8e25db7
--- /dev/null
+++ b/src/Crypto/sha256-x86-nayuki.S
@@ -0,0 +1,168 @@
+/*
+ * SHA-256 hash in x86 assembly
+ *
+ * Copyright (c) 2014 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/fast-sha2-hashes-in-x86-assembly
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ * implied, including but not limited to the warranties of merchantability,
+ * fitness for a particular purpose and noninfringement. In no event shall the
+ * authors or copyright holders be liable for any claim, damages or other
+ * liability, whether in an action of contract, tort or otherwise, arising from,
+ * out of or in connection with the Software or the use or other dealings in the
+ * Software.
+ */
+
+
+/* void sha256_compress_nayuki(uint32_t state[8], const uint8_t block[64]) */
+
+ .ifdef MS_STDCALL
+ .globl _sha256_compress_nayuki@8
+ _sha256_compress_nayuki@8:
+ .else
+ .globl sha256_compress_nayuki
+ .globl _sha256_compress_nayuki
+ sha256_compress_nayuki:
+ _sha256_compress_nayuki:
+ .endif
+
+ /*
+ * Storage usage:
+ * Bytes Location Description
+ * 4 eax Temporary for calculation per round
+ * 4 ebx Temporary for calculation per round
+ * 4 ecx Temporary for calculation per round
+ * 4 edx Temporary for calculation per round
+ * 4 ebp Temporary for calculation per round
+ * 4 esi (During state loading and update) base address of state array argument
+ * (During hash rounds) temporary for calculation per round
+ * 4 edi Base address of block array argument (during key schedule loading rounds only)
+ * 4 esp x86 stack pointer
+ * 32 [esp+ 0] SHA-256 state variables A,B,C,D,E,F,G,H (4 bytes each)
+ * 64 [esp+ 32] Key schedule of 16 * 4 bytes
+ * 4 [esp+ 96] Caller's value of ebx
+ * 4 [esp+100] Caller's value of esi
+ * 4 [esp+104] Caller's value of edi
+ * 4 [esp+108] Caller's value of ebp
+ */
+
+ subl $112, %esp
+ movl %ebx, 96(%esp)
+ movl %esi, 100(%esp)
+ movl %edi, 104(%esp)
+ movl %ebp, 108(%esp)
+
+
+ movl 116(%esp), %esi
+ movl 0(%esi), %eax; movl %eax, 0(%esp)
+ movl 4(%esi), %eax; movl %eax, 4(%esp)
+ movl 8(%esi), %eax; movl %eax, 8(%esp)
+ movl 12(%esi), %eax; movl %eax, 12(%esp)
+ movl 16(%esi), %eax; movl %eax, 16(%esp)
+ movl 20(%esi), %eax; movl %eax, 20(%esp)
+ movl 24(%esi), %eax; movl %eax, 24(%esp)
+ movl 28(%esi), %eax; movl %eax, 28(%esp)
+
+
+ movl 120(%esp), %edi
+ movl (0*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((0)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x428A2F98(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp);
+ movl (1*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((1)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x71374491(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp);
+ movl (2*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((2)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xB5C0FBCF(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp);
+ movl (3*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((3)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xE9B5DBA5(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp);
+ movl (4*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((4)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x3956C25B(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp);
+ movl (5*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((5)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x59F111F1(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp);
+ movl (6*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((6)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x923F82A4(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp);
+ movl (7*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((7)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xAB1C5ED5(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp);
+ movl (8*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((8)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xD807AA98(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp);
+ movl (9*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((9)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x12835B01(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp);
+ movl (10*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((10)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x243185BE(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp);
+ movl (11*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((11)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x550C7DC3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp);
+ movl (12*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((12)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x72BE5D74(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp);
+ movl (13*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((13)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x80DEB1FE(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp);
+ movl (14*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((14)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x9BDC06A7(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp);
+ movl (15*4)(%edi), %ebp; bswapl %ebp; movl %ebp, ((((15)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC19BF174(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp);
+ movl ((((16 -15)&0xF)+8)*4)(%esp), %eax; movl ((((16 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((16 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((16 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((16)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xE49B69C1(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp);
+ movl ((((17 -15)&0xF)+8)*4)(%esp), %eax; movl ((((17 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((17 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((17 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((17)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xEFBE4786(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp);
+ movl ((((18 -15)&0xF)+8)*4)(%esp), %eax; movl ((((18 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((18 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((18 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((18)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x0FC19DC6(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp);
+ movl ((((19 -15)&0xF)+8)*4)(%esp), %eax; movl ((((19 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((19 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((19 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((19)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x240CA1CC(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp);
+ movl ((((20 -15)&0xF)+8)*4)(%esp), %eax; movl ((((20 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((20 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((20 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((20)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x2DE92C6F(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp);
+ movl ((((21 -15)&0xF)+8)*4)(%esp), %eax; movl ((((21 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((21 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((21 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((21)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x4A7484AA(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp);
+ movl ((((22 -15)&0xF)+8)*4)(%esp), %eax; movl ((((22 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((22 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((22 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((22)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x5CB0A9DC(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp);
+ movl ((((23 -15)&0xF)+8)*4)(%esp), %eax; movl ((((23 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((23 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((23 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((23)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x76F988DA(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp);
+ movl ((((24 -15)&0xF)+8)*4)(%esp), %eax; movl ((((24 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((24 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((24 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((24)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x983E5152(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp);
+ movl ((((25 -15)&0xF)+8)*4)(%esp), %eax; movl ((((25 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((25 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((25 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((25)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xA831C66D(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp);
+ movl ((((26 -15)&0xF)+8)*4)(%esp), %eax; movl ((((26 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((26 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((26 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((26)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xB00327C8(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp);
+ movl ((((27 -15)&0xF)+8)*4)(%esp), %eax; movl ((((27 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((27 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((27 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((27)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xBF597FC7(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp);
+ movl ((((28 -15)&0xF)+8)*4)(%esp), %eax; movl ((((28 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((28 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((28 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((28)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC6E00BF3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp);
+ movl ((((29 -15)&0xF)+8)*4)(%esp), %eax; movl ((((29 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((29 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((29 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((29)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xD5A79147(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp);
+ movl ((((30 -15)&0xF)+8)*4)(%esp), %eax; movl ((((30 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((30 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((30 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((30)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x06CA6351(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp);
+ movl ((((31 -15)&0xF)+8)*4)(%esp), %eax; movl ((((31 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((31 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((31 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((31)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x14292967(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp);
+ movl ((((32 -15)&0xF)+8)*4)(%esp), %eax; movl ((((32 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((32 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((32 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((32)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x27B70A85(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp);
+ movl ((((33 -15)&0xF)+8)*4)(%esp), %eax; movl ((((33 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((33 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((33 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((33)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x2E1B2138(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp);
+ movl ((((34 -15)&0xF)+8)*4)(%esp), %eax; movl ((((34 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((34 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((34 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((34)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x4D2C6DFC(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp);
+ movl ((((35 -15)&0xF)+8)*4)(%esp), %eax; movl ((((35 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((35 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((35 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((35)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x53380D13(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp);
+ movl ((((36 -15)&0xF)+8)*4)(%esp), %eax; movl ((((36 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((36 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((36 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((36)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x650A7354(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp);
+ movl ((((37 -15)&0xF)+8)*4)(%esp), %eax; movl ((((37 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((37 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((37 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((37)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x766A0ABB(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp);
+ movl ((((38 -15)&0xF)+8)*4)(%esp), %eax; movl ((((38 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((38 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((38 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((38)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x81C2C92E(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp);
+ movl ((((39 -15)&0xF)+8)*4)(%esp), %eax; movl ((((39 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((39 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((39 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((39)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x92722C85(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp);
+ movl ((((40 -15)&0xF)+8)*4)(%esp), %eax; movl ((((40 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((40 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((40 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((40)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xA2BFE8A1(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp);
+ movl ((((41 -15)&0xF)+8)*4)(%esp), %eax; movl ((((41 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((41 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((41 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((41)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xA81A664B(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp);
+ movl ((((42 -15)&0xF)+8)*4)(%esp), %eax; movl ((((42 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((42 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((42 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((42)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC24B8B70(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp);
+ movl ((((43 -15)&0xF)+8)*4)(%esp), %eax; movl ((((43 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((43 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((43 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((43)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC76C51A3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp);
+ movl ((((44 -15)&0xF)+8)*4)(%esp), %eax; movl ((((44 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((44 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((44 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((44)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xD192E819(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp);
+ movl ((((45 -15)&0xF)+8)*4)(%esp), %eax; movl ((((45 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((45 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((45 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((45)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xD6990624(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp);
+ movl ((((46 -15)&0xF)+8)*4)(%esp), %eax; movl ((((46 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((46 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((46 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((46)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xF40E3585(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp);
+ movl ((((47 -15)&0xF)+8)*4)(%esp), %eax; movl ((((47 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((47 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((47 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((47)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x106AA070(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp);
+ movl ((((48 -15)&0xF)+8)*4)(%esp), %eax; movl ((((48 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((48 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((48 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((48)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x19A4C116(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp);
+ movl ((((49 -15)&0xF)+8)*4)(%esp), %eax; movl ((((49 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((49 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((49 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((49)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x1E376C08(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp);
+ movl ((((50 -15)&0xF)+8)*4)(%esp), %eax; movl ((((50 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((50 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((50 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((50)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x2748774C(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp);
+ movl ((((51 -15)&0xF)+8)*4)(%esp), %eax; movl ((((51 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((51 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((51 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((51)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x34B0BCB5(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp);
+ movl ((((52 -15)&0xF)+8)*4)(%esp), %eax; movl ((((52 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((52 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((52 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((52)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x391C0CB3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp);
+ movl ((((53 -15)&0xF)+8)*4)(%esp), %eax; movl ((((53 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((53 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((53 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((53)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x4ED8AA4A(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp);
+ movl ((((54 -15)&0xF)+8)*4)(%esp), %eax; movl ((((54 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((54 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((54 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((54)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x5B9CCA4F(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp);
+ movl ((((55 -15)&0xF)+8)*4)(%esp), %eax; movl ((((55 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((55 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((55 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((55)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x682E6FF3(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp);
+ movl ((((56 -15)&0xF)+8)*4)(%esp), %eax; movl ((((56 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((56 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((56 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((56)&0xF)+8)*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (7*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (6*4)(%esp), %ebx; movl (5*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x748F82EE(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (3*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (2*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (1*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (7*4)(%esp);
+ movl ((((57 -15)&0xF)+8)*4)(%esp), %eax; movl ((((57 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((57 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((57 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((57)&0xF)+8)*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (6*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (5*4)(%esp), %ebx; movl (4*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x78A5636F(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (2*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (1*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (0*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (6*4)(%esp);
+ movl ((((58 -15)&0xF)+8)*4)(%esp), %eax; movl ((((58 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((58 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((58 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((58)&0xF)+8)*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (5*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (4*4)(%esp), %ebx; movl (3*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x84C87814(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (1*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (0*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (7*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (5*4)(%esp);
+ movl ((((59 -15)&0xF)+8)*4)(%esp), %eax; movl ((((59 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((59 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((59 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((59)&0xF)+8)*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (4*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (3*4)(%esp), %ebx; movl (2*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x8CC70208(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (0*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (7*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (6*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (4*4)(%esp);
+ movl ((((60 -15)&0xF)+8)*4)(%esp), %eax; movl ((((60 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((60 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((60 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((60)&0xF)+8)*4)(%esp); movl (0*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (3*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (2*4)(%esp), %ebx; movl (1*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0x90BEFFFA(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (7*4)(%esp); movl (4*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (6*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (5*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (3*4)(%esp);
+ movl ((((61 -15)&0xF)+8)*4)(%esp), %eax; movl ((((61 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((61 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((61 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((61)&0xF)+8)*4)(%esp); movl (7*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (2*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (1*4)(%esp), %ebx; movl (0*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xA4506CEB(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (6*4)(%esp); movl (3*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (5*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (4*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (2*4)(%esp);
+ movl ((((62 -15)&0xF)+8)*4)(%esp), %eax; movl ((((62 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((62 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((62 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((62)&0xF)+8)*4)(%esp); movl (6*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (1*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (0*4)(%esp), %ebx; movl (7*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xBEF9A3F7(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (5*4)(%esp); movl (2*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (4*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (3*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (1*4)(%esp);
+ movl ((((63 -15)&0xF)+8)*4)(%esp), %eax; movl ((((63 -16)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ebx; addl ((((63 - 7)&0xF)+8)*4)(%esp), %ebp; movl %eax, %ecx; rorl $18, %ebx; shrl $3, %ecx; rorl $7, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl ((((63 - 2)&0xF)+8)*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; rorl $19, %ebx; shrl $10, %ecx; rorl $17, %eax; xorl %ecx, %ebx; xorl %ebx, %eax; addl %eax, %ebp; movl %ebp, ((((63)&0xF)+8)*4)(%esp); movl (5*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $11, %eax; rorl $25, %ebx; rorl $6, %ecx; movl (0*4)(%esp), %esi; xorl %ebx, %eax; xorl %eax, %ecx; addl %ebp, %esi; movl (7*4)(%esp), %ebx; movl (6*4)(%esp), %eax; xorl %ebx, %eax; andl %edx, %eax; xorl %ebx, %eax; leal 0xC67178F2(%ecx,%eax), %ecx; addl %ecx, %esi; addl %esi, (4*4)(%esp); movl (1*4)(%esp), %eax; movl %eax, %ebx; movl %eax, %ecx; movl %eax, %edx; rorl $13, %eax; rorl $22, %ebx; rorl $2, %ecx; xorl %ebx, %eax; xorl %eax, %ecx; movl (3*4)(%esp), %eax; addl %ecx, %esi; movl %eax, %ecx; movl (2*4)(%esp), %ebx; orl %ebx, %ecx; andl %ebx, %eax; andl %edx, %ecx; orl %eax, %ecx; addl %ecx, %esi; movl %esi, (0*4)(%esp);
+
+
+ movl 116(%esp), %esi
+ movl 0(%esp), %eax; addl %eax, 0(%esi)
+ movl 4(%esp), %eax; addl %eax, 4(%esi)
+ movl 8(%esp), %eax; addl %eax, 8(%esi)
+ movl 12(%esp), %eax; addl %eax, 12(%esi)
+ movl 16(%esp), %eax; addl %eax, 16(%esi)
+ movl 20(%esp), %eax; addl %eax, 20(%esi)
+ movl 24(%esp), %eax; addl %eax, 24(%esi)
+ movl 28(%esp), %eax; addl %eax, 28(%esi)
+
+
+ movl 96(%esp), %ebx
+ movl 100(%esp), %esi
+ movl 104(%esp), %edi
+ movl 108(%esp), %ebp
+ addl $112, %esp
+ .ifdef MS_STDCALL
+ ret $8
+ .else
+ retl
+ .endif
+
+ .ifndef WINABI
+#if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+ .endif \ No newline at end of file
diff --git a/src/Crypto/sha256_avx1_x64.asm b/src/Crypto/sha256_avx1_x64.asm
new file mode 100644
index 00000000..5c4ce559
--- /dev/null
+++ b/src/Crypto/sha256_avx1_x64.asm
@@ -0,0 +1,596 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
+; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+; The paper is expected to be released roughly at the end of April, 2012
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm [mem], reg
+; Add reg to mem using reg-mem add and store
+%macro addm 2
+ add %2, %1
+ mov %1, %2
+%endm
+
+%macro MY_ROR 2
+ shld %1,%1,(32-(%2))
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+ VMOVDQ %1, %2
+ vpshufb %1, %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XTMP4 xmm8
+%define XFER xmm9
+%define XTMP5 xmm11
+
+%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm13
+
+%ifndef WINABI
+%define NUM_BLKS rdx ; 3rd arg
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+
+%define SRND rdi ; clobbers INP
+%define c ecx
+%define d r8d
+%define e edx
+%else
+%define NUM_BLKS r8 ; 3rd arg
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+
+%define SRND rcx ; clobbers INP
+%define c edi
+%define d esi
+%define e r8d
+
+%endif
+%define TBL rbp
+%define a eax
+%define b ebx
+
+%define f r9d
+%define g r10d
+%define h r11d
+
+%define y0 r13d
+%define y1 r14d
+%define y2 r15d
+
+
+_INP_END_SIZE equ 8
+_INP_SIZE equ 8
+_XFER_SIZE equ 8
+%ifndef WINABI
+_XMM_SAVE_SIZE equ 0
+%else
+_XMM_SAVE_SIZE equ 8*16
+%endif
+; STACK_SIZE plus pushes must be an odd multiple of 8
+_ALIGN_SIZE equ 8
+
+_INP_END equ 0
+_INP equ _INP_END + _INP_END_SIZE
+_XFER equ _INP + _INP_SIZE
+_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
+STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ ;vmovdqa XTMP0, X3
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ ;vmovdqa XTMP1, X1
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ;; compute s0
+ vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
+
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+
+ vpsrld XTMP2, XTMP1, 7
+
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+
+ vpslld XTMP3, XTMP1, (32-7)
+
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+
+ vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7
+
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+
+
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+
+ vpsrld XTMP2, XTMP1,18
+
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+
+ vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
+
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+
+ vpslld XTMP1, XTMP1, (32-18)
+
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+ vpxor XTMP3, XTMP3, XTMP1
+
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+
+ vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+
+ vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
+
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ ;; compute low s1
+ vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
+
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+
+ ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
+
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+
+ vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+
+ xor y2, g ; y2 = f^g
+
+ vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
+
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+
+ vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
+
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ vpxor XTMP2, XTMP2, XTMP3
+ add y2, y0 ; y2 = S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
+ vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ ;; compute high s1
+ vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+
+ vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
+
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+
+ vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
+
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+
+ vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
+
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+ vpxor XTMP2, XTMP2, XTMP3
+
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
+ vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+;; arg 3 : Num blocks
+section .text
+global sha256_avx
+align 32
+sha256_avx:
+ push rbx
+%ifdef WINABI
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r13
+ push r14
+ push r15
+
+ sub rsp,STACK_SIZE
+%ifdef WINABI
+ vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
+ vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+ shl NUM_BLKS, 6 ; convert to bytes
+ jz done_hash
+ add NUM_BLKS, INP ; pointer to end of data
+ mov [rsp + _INP_END], NUM_BLKS
+
+ ;; load initial digest
+ mov a,[4*0 + CTX]
+ mov b,[4*1 + CTX]
+ mov c,[4*2 + CTX]
+ mov d,[4*3 + CTX]
+ mov e,[4*4 + CTX]
+ mov f,[4*5 + CTX]
+ mov g,[4*6 + CTX]
+ mov h,[4*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+ vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
+ vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
+
+loop0:
+ lea TBL,[K256 wrt rip]
+
+ ;; byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+ mov [rsp + _INP], INP
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov SRND, 3
+align 16
+loop1:
+ vpaddd XFER, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 2*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 3*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ mov SRND, 2
+loop2:
+ vpaddd XFER, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], XFER
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vpaddd XFER, X1, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 2*16
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vmovdqa X0, X2
+ vmovdqa X1, X3
+
+ sub SRND, 1
+ jne loop2
+
+
+ addm [4*0 + CTX],a
+ addm [4*1 + CTX],b
+ addm [4*2 + CTX],c
+ addm [4*3 + CTX],d
+ addm [4*4 + CTX],e
+ addm [4*5 + CTX],f
+ addm [4*6 + CTX],g
+ addm [4*7 + CTX],h
+
+ mov INP, [rsp + _INP]
+ add INP, 64
+ cmp INP, [rsp + _INP_END]
+ jne loop0
+
+done_hash:
+%ifdef WINABI
+ vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+ vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
+%endif
+
+
+ add rsp, STACK_SIZE
+
+ pop r15
+ pop r14
+ pop r13
+ pop rbp
+%ifdef WINABI
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+
+section .data
+align 64
+K256:
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+; shuffle xDxC -> DC00
+_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha256_avx1_x86.asm b/src/Crypto/sha256_avx1_x86.asm
new file mode 100644
index 00000000..31c8bd0d
--- /dev/null
+++ b/src/Crypto/sha256_avx1_x86.asm
@@ -0,0 +1,10 @@
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha256_avx2_x64.asm b/src/Crypto/sha256_avx2_x64.asm
new file mode 100644
index 00000000..458c2945
--- /dev/null
+++ b/src/Crypto/sha256_avx2_x64.asm
@@ -0,0 +1,840 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx2_rorx2.obj -g cv8 sha256_avx2_rorx2.asm
+; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx2_rorx2.o sha256_avx2_rorx2.asm
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+; The paper is expected to be released roughly at the end of April, 2012
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 2 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Modified by kerukuro for use in cppcrypto.
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm [mem], reg
+; Add reg to mem using reg-mem add and store
+%macro addm 2
+ add %2, %1
+ mov %1, %2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 ymm4
+%define X1 ymm5
+%define X2 ymm6
+%define X3 ymm7
+
+; XMM versions of above
+%define XWORD0 xmm4
+%define XWORD1 xmm5
+%define XWORD2 xmm6
+%define XWORD3 xmm7
+
+%define XTMP0 ymm0
+%define XTMP1 ymm1
+%define XTMP2 ymm2
+%define XTMP3 ymm3
+%define XTMP4 ymm8
+%define XFER ymm9
+%define XTMP5 ymm11
+
+%define SHUF_00BA ymm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 ymm12 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK ymm13
+
+%define X_BYTE_FLIP_MASK xmm13 ; XMM version of BYTE_FLIP_MASK
+
+%ifndef WINABI
+%define NUM_BLKS rdx ; 3rd arg
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+%define c ecx
+%define d r8d
+%define e edx ; clobbers NUM_BLKS
+%define y3 edi ; clobbers INP
+%else
+%define NUM_BLKS r8 ; 3rd arg
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+%define c edi
+%define d esi
+%define e r8d ; clobbers NUM_BLKS
+%define y3 ecx ; clobbers INP
+
+%endif
+
+
+%define TBL rbp
+%define SRND CTX ; SRND is same register as CTX
+
+%define a eax
+%define b ebx
+%define f r9d
+%define g r10d
+%define h r11d
+%define old_h r11d
+
+%define T1 r12d
+%define y0 r13d
+%define y1 r14d
+%define y2 r15d
+
+
+_XFER_SIZE equ 2*64*4 ; 2 blocks, 64 rounds, 4 bytes/round
+%ifndef WINABI
+_XMM_SAVE_SIZE equ 0
+%else
+_XMM_SAVE_SIZE equ 8*16
+%endif
+_INP_END_SIZE equ 8
+_INP_SIZE equ 8
+_CTX_SIZE equ 8
+_RSP_SIZE equ 8
+
+_XFER equ 0
+_XMM_SAVE equ _XFER + _XFER_SIZE
+_INP_END equ _XMM_SAVE + _XMM_SAVE_SIZE
+_INP equ _INP_END + _INP_END_SIZE
+_CTX equ _INP + _INP_SIZE
+_RSP equ _CTX + _CTX_SIZE
+STACK_SIZE equ _RSP + _RSP_SIZE
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine old_h h
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 1
+%define %%XFER %1
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov y3, a ; y3 = a ; MAJA
+ rorx y0, e, 25 ; y0 = e >> 25 ; S1A
+ rorx y1, e, 11 ; y1 = e >> 11 ; S1B
+
+ add h, dword[%%XFER+0*4] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+ vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
+ mov y2, f ; y2 = f ; CH
+ rorx T1, a, 13 ; T1 = a >> 13 ; S0B
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
+ xor y2, g ; y2 = f^g ; CH
+ vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
+ rorx y1, e, 6 ; y1 = (e >> 6) ; S1
+
+ and y2, e ; y2 = (f^g)&e ; CH
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
+ rorx y1, a, 22 ; y1 = a >> 22 ; S0A
+ add d, h ; d = k + w + h + d ; --
+
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
+ rorx T1, a, 2 ; T1 = (a >> 2) ; S0
+
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ vpsrld XTMP2, XTMP1, 7
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and T1, c ; T1 = a&c ; MAJB
+
+ add y2, y0 ; y2 = S1 + CH ; --
+ vpslld XTMP3, XTMP1, (32-7)
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+ vpor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7
+
+ vpsrld XTMP2, XTMP1,18
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+
+ROTATE_ARGS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ mov y3, a ; y3 = a ; MAJA
+ rorx y0, e, 25 ; y0 = e >> 25 ; S1A
+ rorx y1, e, 11 ; y1 = e >> 11 ; S1B
+ add h, dword[%%XFER+1*4] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+
+ vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
+ mov y2, f ; y2 = f ; CH
+ rorx T1, a, 13 ; T1 = a >> 13 ; S0B
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
+ xor y2, g ; y2 = f^g ; CH
+
+
+ rorx y1, e, 6 ; y1 = (e >> 6) ; S1
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
+ rorx y1, a, 22 ; y1 = a >> 22 ; S0A
+ and y2, e ; y2 = (f^g)&e ; CH
+ add d, h ; d = k + w + h + d ; --
+
+ vpslld XTMP1, XTMP1, (32-18)
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
+
+ vpxor XTMP3, XTMP3, XTMP1
+ rorx T1, a, 2 ; T1 = (a >> 2) ; S0
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+
+ vpxor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+ vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
+ vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+
+
+ROTATE_ARGS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov y3, a ; y3 = a ; MAJA
+ rorx y0, e, 25 ; y0 = e >> 25 ; S1A
+ add h, [%%XFER+2*4] ; h = k + w + h ; --
+
+ vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
+ rorx y1, e, 11 ; y1 = e >> 11 ; S1B
+ or y3, c ; y3 = a|c ; MAJA
+ mov y2, f ; y2 = f ; CH
+ xor y2, g ; y2 = f^g ; CH
+
+ rorx T1, a, 13 ; T1 = a >> 13 ; S0B
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
+ vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
+ and y2, e ; y2 = (f^g)&e ; CH
+
+ rorx y1, e, 6 ; y1 = (e >> 6) ; S1
+ vpxor XTMP2, XTMP2, XTMP3
+ add d, h ; d = k + w + h + d ; --
+ and y3, b ; y3 = (a|c)&b ; MAJA
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
+ rorx y1, a, 22 ; y1 = a >> 22 ; S0A
+ vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+
+ vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
+ rorx T1, a, 2 ; T1 = (a >> 2) ; S0
+ vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+ vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
+
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+
+ROTATE_ARGS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov y3, a ; y3 = a ; MAJA
+ rorx y0, e, 25 ; y0 = e >> 25 ; S1A
+ rorx y1, e, 11 ; y1 = e >> 11 ; S1B
+ add h, dword[%%XFER+3*4] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+
+ vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
+ mov y2, f ; y2 = f ; CH
+ rorx T1, a, 13 ; T1 = a >> 13 ; S0B
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
+ xor y2, g ; y2 = f^g ; CH
+
+
+ vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
+ rorx y1, e, 6 ; y1 = (e >> 6) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+ add d, h ; d = k + w + h + d ; --
+ and y3, b ; y3 = (a|c)&b ; MAJA
+
+ vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+
+ vpxor XTMP2, XTMP2, XTMP3
+ rorx y1, a, 22 ; y1 = a >> 22 ; S0A
+ add y2, y0 ; y2 = S1 + CH ; --
+
+ vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+ rorx T1, a, 2 ; T1 = (a >> 2) ; S0
+ vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
+
+ vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and T1, c ; T1 = a&c ; MAJB
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+
+ add h, y1 ; h = k + w + h + S0 ; --
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+%macro DO_4ROUNDS 1
+%define %%XFER %1
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov y2, f ; y2 = f ; CH
+ rorx y0, e, 25 ; y0 = e >> 25 ; S1A
+ rorx y1, e, 11 ; y1 = e >> 11 ; S1B
+ xor y2, g ; y2 = f^g ; CH
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
+ rorx y1, e, 6 ; y1 = (e >> 6) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
+ rorx T1, a, 13 ; T1 = a >> 13 ; S0B
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ rorx y1, a, 22 ; y1 = a >> 22 ; S0A
+ mov y3, a ; y3 = a ; MAJA
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
+ rorx T1, a, 2 ; T1 = (a >> 2) ; S0
+ add h, dword[%%XFER + 4*0] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+
+ add d, h ; d = k + w + h + d ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+
+ ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+
+ ;add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ ROTATE_ARGS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+ mov y2, f ; y2 = f ; CH
+ rorx y0, e, 25 ; y0 = e >> 25 ; S1A
+ rorx y1, e, 11 ; y1 = e >> 11 ; S1B
+ xor y2, g ; y2 = f^g ; CH
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
+ rorx y1, e, 6 ; y1 = (e >> 6) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+ add old_h, y3 ; h = t1 + S0 + MAJ ; --
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
+ rorx T1, a, 13 ; T1 = a >> 13 ; S0B
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ rorx y1, a, 22 ; y1 = a >> 22 ; S0A
+ mov y3, a ; y3 = a ; MAJA
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
+ rorx T1, a, 2 ; T1 = (a >> 2) ; S0
+ add h, dword[%%XFER + 4*1] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+
+ add d, h ; d = k + w + h + d ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+
+ ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+
+ ;add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ ROTATE_ARGS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+ mov y2, f ; y2 = f ; CH
+ rorx y0, e, 25 ; y0 = e >> 25 ; S1A
+ rorx y1, e, 11 ; y1 = e >> 11 ; S1B
+ xor y2, g ; y2 = f^g ; CH
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
+ rorx y1, e, 6 ; y1 = (e >> 6) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+ add old_h, y3 ; h = t1 + S0 + MAJ ; --
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
+ rorx T1, a, 13 ; T1 = a >> 13 ; S0B
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ rorx y1, a, 22 ; y1 = a >> 22 ; S0A
+ mov y3, a ; y3 = a ; MAJA
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
+ rorx T1, a, 2 ; T1 = (a >> 2) ; S0
+ add h, dword[%%XFER + 4*2] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+
+ add d, h ; d = k + w + h + d ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+
+ ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+
+ ;add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ ROTATE_ARGS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+ mov y2, f ; y2 = f ; CH
+ rorx y0, e, 25 ; y0 = e >> 25 ; S1A
+ rorx y1, e, 11 ; y1 = e >> 11 ; S1B
+ xor y2, g ; y2 = f^g ; CH
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ; S1
+ rorx y1, e, 6 ; y1 = (e >> 6) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+ add old_h, y3 ; h = t1 + S0 + MAJ ; --
+
+ xor y0, y1 ; y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1
+ rorx T1, a, 13 ; T1 = a >> 13 ; S0B
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ rorx y1, a, 22 ; y1 = a >> 22 ; S0A
+ mov y3, a ; y3 = a ; MAJA
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ; S0
+ rorx T1, a, 2 ; T1 = (a >> 2) ; S0
+ add h, dword[%%XFER + 4*3] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+ xor y1, T1 ; y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+
+ add d, h ; d = k + w + h + d ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; --
+
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ ROTATE_ARGS
+
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+;; arg 3 : Num blocks
+section .text
+global sha256_rorx
+global _sha256_rorx
+align 32
+sha256_rorx:
+_sha256_rorx:
+ push rbx
+%ifdef WINABI
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov rax, rsp
+ sub rsp,STACK_SIZE
+ and rsp, -32
+ mov [rsp + _RSP], rax
+
+%ifdef WINABI
+ vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
+ vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+ shl NUM_BLKS, 6 ; convert to bytes
+ jz done_hash
+ lea NUM_BLKS, [NUM_BLKS + INP - 64] ; pointer to last block
+ mov [rsp + _INP_END], NUM_BLKS
+
+ cmp INP, NUM_BLKS
+ je only_one_block
+
+ ;; load initial digest
+ mov a,[4*0 + CTX]
+ mov b,[4*1 + CTX]
+ mov c,[4*2 + CTX]
+ mov d,[4*3 + CTX]
+ mov e,[4*4 + CTX]
+ mov f,[4*5 + CTX]
+ mov g,[4*6 + CTX]
+ mov h,[4*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+ vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
+ vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
+
+ mov [rsp + _CTX], CTX
+
+loop0:
+ lea TBL,[K256 wrt rip]
+
+ ;; Load first 16 dwords from two blocks
+ VMOVDQ XTMP0, [INP + 0*32]
+ VMOVDQ XTMP1, [INP + 1*32]
+ VMOVDQ XTMP2, [INP + 2*32]
+ VMOVDQ XTMP3, [INP + 3*32]
+
+ ;; byte swap data
+ vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK
+ vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK
+ vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK
+ vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK
+
+ ;; transpose data into high/low halves
+ vperm2i128 X0, XTMP0, XTMP2, 0x20
+ vperm2i128 X1, XTMP0, XTMP2, 0x31
+ vperm2i128 X2, XTMP1, XTMP3, 0x20
+ vperm2i128 X3, XTMP1, XTMP3, 0x31
+
+last_block_enter:
+ add INP, 64
+ mov [rsp + _INP], INP
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 12 each
+ xor SRND, SRND
+
+align 16
+loop1:
+ vpaddd XFER, X0, [TBL + SRND + 0*32]
+ vmovdqa [rsp + _XFER + SRND + 0*32], XFER
+ FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32
+
+ vpaddd XFER, X0, [TBL + SRND + 1*32]
+ vmovdqa [rsp + _XFER + SRND + 1*32], XFER
+ FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32
+
+ vpaddd XFER, X0, [TBL + SRND + 2*32]
+ vmovdqa [rsp + _XFER + SRND + 2*32], XFER
+ FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32
+
+ vpaddd XFER, X0, [TBL + SRND + 3*32]
+ vmovdqa [rsp + _XFER + SRND + 3*32], XFER
+ FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32
+
+ add SRND, 4*32
+ cmp SRND, 3 * 4*32
+ jb loop1
+
+loop2:
+ ;; Do last 16 rounds with no scheduling
+ vpaddd XFER, X0, [TBL + SRND + 0*32]
+ vmovdqa [rsp + _XFER + SRND + 0*32], XFER
+ DO_4ROUNDS rsp + _XFER + SRND + 0*32
+ vpaddd XFER, X1, [TBL + SRND + 1*32]
+ vmovdqa [rsp + _XFER + SRND + 1*32], XFER
+ DO_4ROUNDS rsp + _XFER + SRND + 1*32
+ add SRND, 2*32
+
+ vmovdqa X0, X2
+ vmovdqa X1, X3
+
+ cmp SRND, 4 * 4*32
+ jb loop2
+
+ mov CTX, [rsp + _CTX]
+ mov INP, [rsp + _INP]
+
+ addm [4*0 + CTX],a
+ addm [4*1 + CTX],b
+ addm [4*2 + CTX],c
+ addm [4*3 + CTX],d
+ addm [4*4 + CTX],e
+ addm [4*5 + CTX],f
+ addm [4*6 + CTX],g
+ addm [4*7 + CTX],h
+
+ cmp INP, [rsp + _INP_END]
+ ja done_hash
+
+ ;;;; Do second block using previously scheduled results
+ xor SRND, SRND
+align 16
+loop3:
+ DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16
+ DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16
+ add SRND, 2*32
+ cmp SRND, 4 * 4*32
+ jb loop3
+
+ mov CTX, [rsp + _CTX]
+ mov INP, [rsp + _INP]
+ add INP, 64
+
+ addm [4*0 + CTX],a
+ addm [4*1 + CTX],b
+ addm [4*2 + CTX],c
+ addm [4*3 + CTX],d
+ addm [4*4 + CTX],e
+ addm [4*5 + CTX],f
+ addm [4*6 + CTX],g
+ addm [4*7 + CTX],h
+
+ cmp INP, [rsp + _INP_END]
+ jb loop0
+ ja done_hash
+
+do_last_block:
+ ;;;; do last block
+ lea TBL,[K256 wrt rip]
+
+ VMOVDQ XWORD0, [INP + 0*16]
+ VMOVDQ XWORD1, [INP + 1*16]
+ VMOVDQ XWORD2, [INP + 2*16]
+ VMOVDQ XWORD3, [INP + 3*16]
+
+ vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK
+ vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK
+ vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK
+ vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK
+
+ jmp last_block_enter
+
+only_one_block:
+
+ ;; load initial digest
+ mov a,[4*0 + CTX]
+ mov b,[4*1 + CTX]
+ mov c,[4*2 + CTX]
+ mov d,[4*3 + CTX]
+ mov e,[4*4 + CTX]
+ mov f,[4*5 + CTX]
+ mov g,[4*6 + CTX]
+ mov h,[4*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+ vmovdqa SHUF_00BA, [_SHUF_00BA wrt rip]
+ vmovdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
+
+ mov [rsp + _CTX], CTX
+ jmp do_last_block
+
+done_hash:
+%ifdef WINABI
+ vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+ vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
+%endif
+
+ mov rsp, [rsp + _RSP]
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+%ifdef WINABI
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+section .data
+align 64
+K256:
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ ddq 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA:
+ ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+; shuffle xDxC -> DC00
+_SHUF_DC00:
+ ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha256_avx2_x86.asm b/src/Crypto/sha256_avx2_x86.asm
new file mode 100644
index 00000000..31c8bd0d
--- /dev/null
+++ b/src/Crypto/sha256_avx2_x86.asm
@@ -0,0 +1,10 @@
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha256_sse4_x64.asm b/src/Crypto/sha256_sse4_x64.asm
new file mode 100644
index 00000000..c11630bc
--- /dev/null
+++ b/src/Crypto/sha256_sse4_x64.asm
@@ -0,0 +1,560 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
+; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+; The paper is expected to be released roughly at the end of April, 2012
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Modified by kerukuro for use in cppcrypto.
+
+; Modified By Mounir IDRASSI for use in VeraCrypt
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm [mem], reg
+; Add reg to mem using reg-mem add and store
+%macro addm 2
+ add %2, %1
+ mov %1, %2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+ MOVDQ %1, %2
+ pshufb %1, %3
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XTMP4 xmm8
+%define XFER xmm9
+
+%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm12
+
+%ifndef WINABI
+%define NUM_BLKS rdx ; 3rd arg
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+
+%define SRND rdi ; clobbers INP
+%define c ecx
+%define d r8d
+%define e edx
+%else
+%define NUM_BLKS r8 ; 3rd arg
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+
+%define SRND rcx ; clobbers INP
+%define c edi
+%define d esi
+%define e r8d
+
+%endif
+%define TBL rbp
+%define a eax
+%define b ebx
+
+%define f r9d
+%define g r10d
+%define h r11d
+
+%define y0 r13d
+%define y1 r14d
+%define y2 r15d
+
+
+
+_INP_END_SIZE equ 8
+_INP_SIZE equ 8
+_XFER_SIZE equ 8
+%ifndef WINABI
+_XMM_SAVE_SIZE equ 0
+%else
+_XMM_SAVE_SIZE equ 7*16
+%endif
+; STACK_SIZE plus pushes must be an odd multiple of 8
+_ALIGN_SIZE equ 8
+
+_INP_END equ 0
+_INP equ _INP_END + _INP_END_SIZE
+_XFER equ _INP + _INP_SIZE
+_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
+STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ movdqa XTMP0, X3
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa XTMP1, X1
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ;; compute s0
+ palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
+ movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pslld XTMP1, (32-7)
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ psrld XTMP2, 7
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ pslld XTMP3, (32-18)
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ psrld XTMP2, 18
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ pxor XTMP1, XTMP3
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pxor XTMP1, XTMP4 ; XTMP1 = s0
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ ;; compute low s1
+ pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ pxor XTMP2, XTMP3
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
+ pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ ;; compute high s1
+ pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ pxor XTMP2, XTMP3
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
+ pxor X0, XTMP2 ; X0 = s1 {xDxC}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+;; arg 3 : Num blocks
+section .text
+global sha256_sse4
+global _sha256_sse4
+align 32
+sha256_sse4:
+_sha256_sse4:
+ push rbx
+%ifdef WINABI
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r13
+ push r14
+ push r15
+
+ sub rsp,STACK_SIZE
+%ifdef WINABI
+ movdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ movdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ movdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ movdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ movdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ movdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ movdqa [rsp + _XMM_SAVE + 6*16],xmm12
+%endif
+
+ shl NUM_BLKS, 6 ; convert to bytes
+ jz done_hash
+ add NUM_BLKS, INP ; pointer to end of data
+ mov [rsp + _INP_END], NUM_BLKS
+
+ ;; load initial digest
+ mov a,[4*0 + CTX]
+ mov b,[4*1 + CTX]
+ mov c,[4*2 + CTX]
+ mov d,[4*3 + CTX]
+ mov e,[4*4 + CTX]
+ mov f,[4*5 + CTX]
+ mov g,[4*6 + CTX]
+ mov h,[4*7 + CTX]
+
+ movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+ movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
+ movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
+
+loop0:
+ lea TBL,[K256 wrt rip]
+
+ ;; byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+ mov [rsp + _INP], INP
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov SRND, 3
+align 16
+loop1:
+ movdqa XFER, [TBL + 0*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 1*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 2*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 3*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ mov SRND, 2
+loop2:
+ paddd X0, [TBL + 0*16]
+ movdqa [rsp + _XFER], X0
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd X1, [TBL + 1*16]
+ movdqa [rsp + _XFER], X1
+ add TBL, 2*16
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X0, X2
+ movdqa X1, X3
+
+ sub SRND, 1
+ jne loop2
+
+ addm [4*0 + CTX],a
+ addm [4*1 + CTX],b
+ addm [4*2 + CTX],c
+ addm [4*3 + CTX],d
+ addm [4*4 + CTX],e
+ addm [4*5 + CTX],f
+ addm [4*6 + CTX],g
+ addm [4*7 + CTX],h
+
+ mov INP, [rsp + _INP]
+ add INP, 64
+ cmp INP, [rsp + _INP_END]
+ jne loop0
+
+done_hash:
+%ifdef WINABI
+ movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+%endif
+
+ add rsp, STACK_SIZE
+
+ pop r15
+ pop r14
+ pop r13
+ pop rbp
+%ifdef WINABI
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+
+section .data
+align 64
+K256:
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+; shuffle xDxC -> DC00
+_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha256_sse4_x86.asm b/src/Crypto/sha256_sse4_x86.asm
new file mode 100644
index 00000000..31c8bd0d
--- /dev/null
+++ b/src/Crypto/sha256_sse4_x86.asm
@@ -0,0 +1,10 @@
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha512-x64-nayuki.S b/src/Crypto/sha512-x64-nayuki.S
new file mode 100644
index 00000000..0e36ac91
--- /dev/null
+++ b/src/Crypto/sha512-x64-nayuki.S
@@ -0,0 +1,202 @@
+/*
+ * SHA-512 hash in x86-64 assembly
+ *
+ * Copyright (c) 2017 Project Nayuki. (MIT License)
+ * https://www.nayuki.io/page/fast-sha2-hashes-in-x86-assembly
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * - The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * - The Software is provided "as is", without warranty of any kind, express or
+ * implied, including but not limited to the warranties of merchantability,
+ * fitness for a particular purpose and noninfringement. In no event shall the
+ * authors or copyright holders be liable for any claim, damages or other
+ * liability, whether in an action of contract, tort or otherwise, arising from,
+ * out of or in connection with the Software or the use or other dealings in the
+ * Software.
+ */
+
+# Adapted for VeraCrypt
+# Adapt to Windows calling convention when building on Windows.
+# avoid using xmm6 register since it must be preserved on Windows. We use MMX registers instead.
+
+
+/* void sha512_compress_nayuki(uint64_t state[8], const uint8_t block[128]) */
+.globl sha512_compress_nayuki
+.globl _sha512_compress_nayuki
+sha512_compress_nayuki:
+_sha512_compress_nayuki:
+ /*
+ * Storage usage:
+ * Bytes Location Description
+ * 8 rax Temporary for calculation per round
+ * 8 rbx Temporary for calculation per round
+ * 8 rcx Temporary for calculation per round
+ * 8 rdx Temporary for calculation per round
+ * 8 rsi Base address of block array argument (read-only)
+ * 8 rdi Base address of state array argument (read-only)
+ * 8 rsp x86-64 stack pointer
+ * 8 r8 SHA-512 state variable A
+ * 8 r9 SHA-512 state variable B
+ * 8 r10 SHA-512 state variable C
+ * 8 r11 SHA-512 state variable D
+ * 8 r12 SHA-512 state variable E
+ * 8 r13 SHA-512 state variable F
+ * 8 r14 SHA-512 state variable G
+ * 8 r15 SHA-512 state variable H
+ * 128 [rsp+0] Circular buffer of most recent 16 key schedule items, 8 bytes each
+ * 16 xmm0 Caller's value of r10 (only low 64 bits are used)
+ * 16 xmm1 Caller's value of r11 (only low 64 bits are used)
+ * 16 xmm2 Caller's value of r12 (only low 64 bits are used)
+ * 16 xmm3 Caller's value of r13 (only low 64 bits are used)
+ * 16 xmm4 Caller's value of r14 (only low 64 bits are used)
+ * 16 xmm5 Caller's value of r15 (only low 64 bits are used)
+ * 8 mm0 Caller's value of rbx
+ */
+ movq %r10, %xmm0
+ movq %r11, %xmm1
+ movq %r12, %xmm2
+ movq %r13, %xmm3
+ movq %r14, %xmm4
+ movq %r15, %xmm5
+ movq %rbx, %mm0
+.ifdef WINABI
+ movq %rdi, %mm1
+ movq %rsi, %mm2
+ movq %rcx, %rdi
+ movq %rdx, %rsi
+.endif
+ subq $128, %rsp
+
+
+ movq 0(%rdi), %r8
+ movq 8(%rdi), %r9
+ movq 16(%rdi), %r10
+ movq 24(%rdi), %r11
+ movq 32(%rdi), %r12
+ movq 40(%rdi), %r13
+ movq 48(%rdi), %r14
+ movq 56(%rdi), %r15
+
+
+ movq (0*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((0)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x428A2F98D728AE22, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (1*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((1)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x7137449123EF65CD, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (2*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((2)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0xB5C0FBCFEC4D3B2F, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (3*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((3)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0xE9B5DBA58189DBBC, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (4*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((4)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x3956C25BF348B538, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (5*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((5)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x59F111F1B605D019, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (6*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((6)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x923F82A4AF194F9B, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (7*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((7)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0xAB1C5ED5DA6D8118, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (8*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((8)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0xD807AA98A3030242, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (9*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((9)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x12835B0145706FBE, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (10*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((10)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x243185BE4EE4B28C, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (11*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((11)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x550C7DC3D5FFB4E2, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (12*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((12)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x72BE5D74F27B896F, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (13*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((13)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x80DEB1FE3B1696B1, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (14*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((14)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x9BDC06A725C71235, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (15*8)(%rsi), %rbx; bswapq %rbx; movq %rbx, (((15)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0xC19BF174CF692694, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (((16 -15)&0xF)*8)(%rsp), %rax; movq (((16 -16)&0xF)*8)(%rsp), %rbx; addq (((16 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((16 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((16)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0xE49B69C19EF14AD2, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (((17 -15)&0xF)*8)(%rsp), %rax; movq (((17 -16)&0xF)*8)(%rsp), %rbx; addq (((17 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((17 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((17)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0xEFBE4786384F25E3, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (((18 -15)&0xF)*8)(%rsp), %rax; movq (((18 -16)&0xF)*8)(%rsp), %rbx; addq (((18 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((18 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((18)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x0FC19DC68B8CD5B5, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (((19 -15)&0xF)*8)(%rsp), %rax; movq (((19 -16)&0xF)*8)(%rsp), %rbx; addq (((19 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((19 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((19)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x240CA1CC77AC9C65, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (((20 -15)&0xF)*8)(%rsp), %rax; movq (((20 -16)&0xF)*8)(%rsp), %rbx; addq (((20 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((20 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((20)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x2DE92C6F592B0275, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (((21 -15)&0xF)*8)(%rsp), %rax; movq (((21 -16)&0xF)*8)(%rsp), %rbx; addq (((21 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((21 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((21)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x4A7484AA6EA6E483, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (((22 -15)&0xF)*8)(%rsp), %rax; movq (((22 -16)&0xF)*8)(%rsp), %rbx; addq (((22 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((22 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((22)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x5CB0A9DCBD41FBD4, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((23 -15)&0xF)*8)(%rsp), %rax; movq (((23 -16)&0xF)*8)(%rsp), %rbx; addq (((23 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((23 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((23)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x76F988DA831153B5, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (((24 -15)&0xF)*8)(%rsp), %rax; movq (((24 -16)&0xF)*8)(%rsp), %rbx; addq (((24 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((24 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((24)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x983E5152EE66DFAB, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (((25 -15)&0xF)*8)(%rsp), %rax; movq (((25 -16)&0xF)*8)(%rsp), %rbx; addq (((25 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((25 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((25)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0xA831C66D2DB43210, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (((26 -15)&0xF)*8)(%rsp), %rax; movq (((26 -16)&0xF)*8)(%rsp), %rbx; addq (((26 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((26 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((26)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0xB00327C898FB213F, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (((27 -15)&0xF)*8)(%rsp), %rax; movq (((27 -16)&0xF)*8)(%rsp), %rbx; addq (((27 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((27 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((27)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0xBF597FC7BEEF0EE4, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (((28 -15)&0xF)*8)(%rsp), %rax; movq (((28 -16)&0xF)*8)(%rsp), %rbx; addq (((28 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((28 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((28)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0xC6E00BF33DA88FC2, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (((29 -15)&0xF)*8)(%rsp), %rax; movq (((29 -16)&0xF)*8)(%rsp), %rbx; addq (((29 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((29 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((29)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0xD5A79147930AA725, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (((30 -15)&0xF)*8)(%rsp), %rax; movq (((30 -16)&0xF)*8)(%rsp), %rbx; addq (((30 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((30 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((30)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x06CA6351E003826F, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((31 -15)&0xF)*8)(%rsp), %rax; movq (((31 -16)&0xF)*8)(%rsp), %rbx; addq (((31 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((31 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((31)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x142929670A0E6E70, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (((32 -15)&0xF)*8)(%rsp), %rax; movq (((32 -16)&0xF)*8)(%rsp), %rbx; addq (((32 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((32 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((32)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x27B70A8546D22FFC, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (((33 -15)&0xF)*8)(%rsp), %rax; movq (((33 -16)&0xF)*8)(%rsp), %rbx; addq (((33 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((33 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((33)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x2E1B21385C26C926, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (((34 -15)&0xF)*8)(%rsp), %rax; movq (((34 -16)&0xF)*8)(%rsp), %rbx; addq (((34 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((34 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((34)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x4D2C6DFC5AC42AED, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (((35 -15)&0xF)*8)(%rsp), %rax; movq (((35 -16)&0xF)*8)(%rsp), %rbx; addq (((35 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((35 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((35)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x53380D139D95B3DF, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (((36 -15)&0xF)*8)(%rsp), %rax; movq (((36 -16)&0xF)*8)(%rsp), %rbx; addq (((36 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((36 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((36)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x650A73548BAF63DE, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (((37 -15)&0xF)*8)(%rsp), %rax; movq (((37 -16)&0xF)*8)(%rsp), %rbx; addq (((37 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((37 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((37)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x766A0ABB3C77B2A8, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (((38 -15)&0xF)*8)(%rsp), %rax; movq (((38 -16)&0xF)*8)(%rsp), %rbx; addq (((38 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((38 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((38)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x81C2C92E47EDAEE6, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((39 -15)&0xF)*8)(%rsp), %rax; movq (((39 -16)&0xF)*8)(%rsp), %rbx; addq (((39 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((39 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((39)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x92722C851482353B, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (((40 -15)&0xF)*8)(%rsp), %rax; movq (((40 -16)&0xF)*8)(%rsp), %rbx; addq (((40 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((40 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((40)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0xA2BFE8A14CF10364, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (((41 -15)&0xF)*8)(%rsp), %rax; movq (((41 -16)&0xF)*8)(%rsp), %rbx; addq (((41 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((41 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((41)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0xA81A664BBC423001, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (((42 -15)&0xF)*8)(%rsp), %rax; movq (((42 -16)&0xF)*8)(%rsp), %rbx; addq (((42 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((42 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((42)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0xC24B8B70D0F89791, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (((43 -15)&0xF)*8)(%rsp), %rax; movq (((43 -16)&0xF)*8)(%rsp), %rbx; addq (((43 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((43 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((43)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0xC76C51A30654BE30, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (((44 -15)&0xF)*8)(%rsp), %rax; movq (((44 -16)&0xF)*8)(%rsp), %rbx; addq (((44 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((44 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((44)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0xD192E819D6EF5218, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (((45 -15)&0xF)*8)(%rsp), %rax; movq (((45 -16)&0xF)*8)(%rsp), %rbx; addq (((45 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((45 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((45)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0xD69906245565A910, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (((46 -15)&0xF)*8)(%rsp), %rax; movq (((46 -16)&0xF)*8)(%rsp), %rbx; addq (((46 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((46 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((46)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0xF40E35855771202A, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((47 -15)&0xF)*8)(%rsp), %rax; movq (((47 -16)&0xF)*8)(%rsp), %rbx; addq (((47 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((47 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((47)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x106AA07032BBD1B8, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (((48 -15)&0xF)*8)(%rsp), %rax; movq (((48 -16)&0xF)*8)(%rsp), %rbx; addq (((48 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((48 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((48)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x19A4C116B8D2D0C8, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (((49 -15)&0xF)*8)(%rsp), %rax; movq (((49 -16)&0xF)*8)(%rsp), %rbx; addq (((49 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((49 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((49)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x1E376C085141AB53, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (((50 -15)&0xF)*8)(%rsp), %rax; movq (((50 -16)&0xF)*8)(%rsp), %rbx; addq (((50 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((50 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((50)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x2748774CDF8EEB99, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (((51 -15)&0xF)*8)(%rsp), %rax; movq (((51 -16)&0xF)*8)(%rsp), %rbx; addq (((51 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((51 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((51)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x34B0BCB5E19B48A8, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (((52 -15)&0xF)*8)(%rsp), %rax; movq (((52 -16)&0xF)*8)(%rsp), %rbx; addq (((52 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((52 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((52)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x391C0CB3C5C95A63, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (((53 -15)&0xF)*8)(%rsp), %rax; movq (((53 -16)&0xF)*8)(%rsp), %rbx; addq (((53 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((53 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((53)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x4ED8AA4AE3418ACB, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (((54 -15)&0xF)*8)(%rsp), %rax; movq (((54 -16)&0xF)*8)(%rsp), %rbx; addq (((54 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((54 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((54)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x5B9CCA4F7763E373, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((55 -15)&0xF)*8)(%rsp), %rax; movq (((55 -16)&0xF)*8)(%rsp), %rbx; addq (((55 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((55 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((55)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x682E6FF3D6B2B8A3, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (((56 -15)&0xF)*8)(%rsp), %rax; movq (((56 -16)&0xF)*8)(%rsp), %rbx; addq (((56 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((56 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((56)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x748F82EE5DEFB2FC, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (((57 -15)&0xF)*8)(%rsp), %rax; movq (((57 -16)&0xF)*8)(%rsp), %rbx; addq (((57 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((57 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((57)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x78A5636F43172F60, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (((58 -15)&0xF)*8)(%rsp), %rax; movq (((58 -16)&0xF)*8)(%rsp), %rbx; addq (((58 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((58 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((58)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x84C87814A1F0AB72, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (((59 -15)&0xF)*8)(%rsp), %rax; movq (((59 -16)&0xF)*8)(%rsp), %rbx; addq (((59 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((59 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((59)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x8CC702081A6439EC, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (((60 -15)&0xF)*8)(%rsp), %rax; movq (((60 -16)&0xF)*8)(%rsp), %rbx; addq (((60 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((60 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((60)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x90BEFFFA23631E28, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (((61 -15)&0xF)*8)(%rsp), %rax; movq (((61 -16)&0xF)*8)(%rsp), %rbx; addq (((61 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((61 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((61)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0xA4506CEBDE82BDE9, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (((62 -15)&0xF)*8)(%rsp), %rax; movq (((62 -16)&0xF)*8)(%rsp), %rbx; addq (((62 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((62 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((62)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0xBEF9A3F7B2C67915, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((63 -15)&0xF)*8)(%rsp), %rax; movq (((63 -16)&0xF)*8)(%rsp), %rbx; addq (((63 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((63 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((63)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0xC67178F2E372532B, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (((64 -15)&0xF)*8)(%rsp), %rax; movq (((64 -16)&0xF)*8)(%rsp), %rbx; addq (((64 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((64 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((64)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0xCA273ECEEA26619C, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (((65 -15)&0xF)*8)(%rsp), %rax; movq (((65 -16)&0xF)*8)(%rsp), %rbx; addq (((65 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((65 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((65)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0xD186B8C721C0C207, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (((66 -15)&0xF)*8)(%rsp), %rax; movq (((66 -16)&0xF)*8)(%rsp), %rbx; addq (((66 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((66 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((66)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0xEADA7DD6CDE0EB1E, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (((67 -15)&0xF)*8)(%rsp), %rax; movq (((67 -16)&0xF)*8)(%rsp), %rbx; addq (((67 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((67 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((67)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0xF57D4F7FEE6ED178, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (((68 -15)&0xF)*8)(%rsp), %rax; movq (((68 -16)&0xF)*8)(%rsp), %rbx; addq (((68 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((68 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((68)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x06F067AA72176FBA, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (((69 -15)&0xF)*8)(%rsp), %rax; movq (((69 -16)&0xF)*8)(%rsp), %rbx; addq (((69 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((69 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((69)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x0A637DC5A2C898A6, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (((70 -15)&0xF)*8)(%rsp), %rax; movq (((70 -16)&0xF)*8)(%rsp), %rbx; addq (((70 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((70 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((70)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x113F9804BEF90DAE, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((71 -15)&0xF)*8)(%rsp), %rax; movq (((71 -16)&0xF)*8)(%rsp), %rbx; addq (((71 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((71 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((71)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x1B710B35131C471B, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+ movq (((72 -15)&0xF)*8)(%rsp), %rax; movq (((72 -16)&0xF)*8)(%rsp), %rbx; addq (((72 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((72 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((72)&0xF)*8)(%rsp); movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r15; movq %r14, %rcx; xorq %r13, %rcx; andq %r12, %rcx; xorq %r14, %rcx; addq %rax, %r15; movabs $0x28DB77F523047D84, %rax; addq %rcx, %r15; addq %rax, %r15; addq %r15, %r11; movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r10, %rcx; addq %rax, %r15; movq %r10, %rax; orq %r9, %rax; andq %r9, %rcx; andq %r8, %rax; orq %rcx, %rax; addq %rax, %r15;
+ movq (((73 -15)&0xF)*8)(%rsp), %rax; movq (((73 -16)&0xF)*8)(%rsp), %rbx; addq (((73 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((73 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((73)&0xF)*8)(%rsp); movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r14; movq %r13, %rcx; xorq %r12, %rcx; andq %r11, %rcx; xorq %r13, %rcx; addq %rax, %r14; movabs $0x32CAAB7B40C72493, %rax; addq %rcx, %r14; addq %rax, %r14; addq %r14, %r10; movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r9, %rcx; addq %rax, %r14; movq %r9, %rax; orq %r8, %rax; andq %r8, %rcx; andq %r15, %rax; orq %rcx, %rax; addq %rax, %r14;
+ movq (((74 -15)&0xF)*8)(%rsp), %rax; movq (((74 -16)&0xF)*8)(%rsp), %rbx; addq (((74 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((74 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((74)&0xF)*8)(%rsp); movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r13; movq %r12, %rcx; xorq %r11, %rcx; andq %r10, %rcx; xorq %r12, %rcx; addq %rax, %r13; movabs $0x3C9EBE0A15C9BEBC, %rax; addq %rcx, %r13; addq %rax, %r13; addq %r13, %r9; movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r8, %rcx; addq %rax, %r13; movq %r8, %rax; orq %r15, %rax; andq %r15, %rcx; andq %r14, %rax; orq %rcx, %rax; addq %rax, %r13;
+ movq (((75 -15)&0xF)*8)(%rsp), %rax; movq (((75 -16)&0xF)*8)(%rsp), %rbx; addq (((75 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((75 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((75)&0xF)*8)(%rsp); movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r12; movq %r11, %rcx; xorq %r10, %rcx; andq %r9, %rcx; xorq %r11, %rcx; addq %rax, %r12; movabs $0x431D67C49C100D4C, %rax; addq %rcx, %r12; addq %rax, %r12; addq %r12, %r8; movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r15, %rcx; addq %rax, %r12; movq %r15, %rax; orq %r14, %rax; andq %r14, %rcx; andq %r13, %rax; orq %rcx, %rax; addq %rax, %r12;
+ movq (((76 -15)&0xF)*8)(%rsp), %rax; movq (((76 -16)&0xF)*8)(%rsp), %rbx; addq (((76 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((76 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((76)&0xF)*8)(%rsp); movq %r8, %rcx; movq %r8, %rdx; movq %r8, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r11; movq %r10, %rcx; xorq %r9, %rcx; andq %r8, %rcx; xorq %r10, %rcx; addq %rax, %r11; movabs $0x4CC5D4BECB3E42B6, %rax; addq %rcx, %r11; addq %rax, %r11; addq %r11, %r15; movq %r12, %rcx; movq %r12, %rdx; movq %r12, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r14, %rcx; addq %rax, %r11; movq %r14, %rax; orq %r13, %rax; andq %r13, %rcx; andq %r12, %rax; orq %rcx, %rax; addq %rax, %r11;
+ movq (((77 -15)&0xF)*8)(%rsp), %rax; movq (((77 -16)&0xF)*8)(%rsp), %rbx; addq (((77 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((77 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((77)&0xF)*8)(%rsp); movq %r15, %rcx; movq %r15, %rdx; movq %r15, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r10; movq %r9, %rcx; xorq %r8, %rcx; andq %r15, %rcx; xorq %r9, %rcx; addq %rax, %r10; movabs $0x597F299CFC657E2A, %rax; addq %rcx, %r10; addq %rax, %r10; addq %r10, %r14; movq %r11, %rcx; movq %r11, %rdx; movq %r11, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r13, %rcx; addq %rax, %r10; movq %r13, %rax; orq %r12, %rax; andq %r12, %rcx; andq %r11, %rax; orq %rcx, %rax; addq %rax, %r10;
+ movq (((78 -15)&0xF)*8)(%rsp), %rax; movq (((78 -16)&0xF)*8)(%rsp), %rbx; addq (((78 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((78 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((78)&0xF)*8)(%rsp); movq %r14, %rcx; movq %r14, %rdx; movq %r14, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r9; movq %r8, %rcx; xorq %r15, %rcx; andq %r14, %rcx; xorq %r8, %rcx; addq %rax, %r9; movabs $0x5FCB6FAB3AD6FAEC, %rax; addq %rcx, %r9; addq %rax, %r9; addq %r9, %r13; movq %r10, %rcx; movq %r10, %rdx; movq %r10, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r12, %rcx; addq %rax, %r9; movq %r12, %rax; orq %r11, %rax; andq %r11, %rcx; andq %r10, %rax; orq %rcx, %rax; addq %rax, %r9;
+ movq (((79 -15)&0xF)*8)(%rsp), %rax; movq (((79 -16)&0xF)*8)(%rsp), %rbx; addq (((79 - 7)&0xF)*8)(%rsp), %rbx; movq %rax, %rcx; movq %rax, %rdx; rorq $8, %rcx; shrq $7, %rdx; rorq $1, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq (((79 - 2)&0xF)*8)(%rsp), %rax; movq %rax, %rcx; movq %rax, %rdx; rorq $61, %rcx; shrq $6, %rdx; rorq $19, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rax, %rbx; movq %rbx, (((79)&0xF)*8)(%rsp); movq %r13, %rcx; movq %r13, %rdx; movq %r13, %rax; rorq $18, %rcx; rorq $41, %rdx; rorq $14, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; addq %rbx, %r8; movq %r15, %rcx; xorq %r14, %rcx; andq %r13, %rcx; xorq %r15, %rcx; addq %rax, %r8; movabs $0x6C44198C4A475817, %rax; addq %rcx, %r8; addq %rax, %r8; addq %r8, %r12; movq %r9, %rcx; movq %r9, %rdx; movq %r9, %rax; rorq $39, %rcx; rorq $34, %rdx; rorq $28, %rax; xorq %rdx, %rcx; xorq %rcx, %rax; movq %r11, %rcx; addq %rax, %r8; movq %r11, %rax; orq %r10, %rax; andq %r10, %rcx; andq %r9, %rax; orq %rcx, %rax; addq %rax, %r8;
+
+
+ addq %r8 , 0(%rdi)
+ addq %r9 , 8(%rdi)
+ addq %r10, 16(%rdi)
+ addq %r11, 24(%rdi)
+ addq %r12, 32(%rdi)
+ addq %r13, 40(%rdi)
+ addq %r14, 48(%rdi)
+ addq %r15, 56(%rdi)
+
+
+ movq %xmm0, %r10
+ movq %xmm1, %r11
+ movq %xmm2, %r12
+ movq %xmm3, %r13
+ movq %xmm4, %r14
+ movq %xmm5, %r15
+ movq %mm0, %rbx
+
+.ifdef WINABI
+ movq %mm1, %rdi
+ movq %mm2, %rsi
+.endif
+
+ emms
+
+ addq $128, %rsp
+
+ retq
+
+ .ifndef WINABI
+#if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+ .endif
diff --git a/src/Crypto/sha512-x86-nayuki.S b/src/Crypto/sha512-x86-nayuki.S
new file mode 100644
index 00000000..dcbebf7a
--- /dev/null
+++ b/src/Crypto/sha512-x86-nayuki.S
@@ -0,0 +1,180 @@
+#
+# SHA-512 hash in x86 assembly
+#
+# Copyright (c) 2014 Project Nayuki
+# http://www.nayuki.io/page/fast-sha2-hashes-in-x86-assembly
+#
+# (MIT License)
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+# - The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# - The Software is provided "as is", without warranty of any kind, express or
+# implied, including but not limited to the warranties of merchantability,
+# fitness for a particular purpose and noninfringement. In no event shall the
+# authors or copyright holders be liable for any claim, damages or other
+# liability, whether in an action of contract, tort or otherwise, arising from,
+# out of or in connection with the Software or the use or other dealings in the
+# Software.
+#
+
+# Modified by kerukuro for use in cppcrypto.
+
+ .ifdef MS_STDCALL
+ .globl _sha512_compress_nayuki@8
+ _sha512_compress_nayuki@8:
+ .else
+ .globl _sha512_compress_nayuki
+ .globl sha512_compress_nayuki
+ _sha512_compress_nayuki:
+ sha512_compress_nayuki:
+ .endif
+
+ movl %esp, %ecx
+ subl $192, %esp
+ andl $~0xF, %esp
+
+
+ movl 4(%ecx), %eax
+ movdqu 0(%eax), %xmm0; movdqu %xmm0, 0(%esp)
+ movdqu 16(%eax), %xmm1; movdqu %xmm1, 16(%esp)
+ movdqu 32(%eax), %xmm2; movdqu %xmm2, 32(%esp)
+ movdqu 48(%eax), %xmm3; movdqu %xmm3, 48(%esp)
+
+
+ movl 8(%ecx), %eax
+ movq .bswap64, %mm7
+ movq (0*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((0)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+0*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (1*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((1)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+1*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (2*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((2)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+2*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (3*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((3)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+3*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (4*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((4)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+4*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (5*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((5)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+5*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (6*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((6)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+6*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (7*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((7)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+7*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (8*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((8)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+8*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (9*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((9)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+9*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (10*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((10)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+10*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (11*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((11)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+11*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (12*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((12)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+12*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (13*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((13)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+13*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (14*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((14)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+14*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (15*8)(%eax), %mm0; pshufb %mm7, %mm0; movq %mm0, (((15)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+15*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (((16 -16)&0xF)*8+64)(%esp), %mm0; paddq (((16 - 7)&0xF)*8+64)(%esp), %mm0; movq (((16 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((16 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((16)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+16*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (((17 -16)&0xF)*8+64)(%esp), %mm0; paddq (((17 - 7)&0xF)*8+64)(%esp), %mm0; movq (((17 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((17 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((17)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+17*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (((18 -16)&0xF)*8+64)(%esp), %mm0; paddq (((18 - 7)&0xF)*8+64)(%esp), %mm0; movq (((18 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((18 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((18)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+18*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (((19 -16)&0xF)*8+64)(%esp), %mm0; paddq (((19 - 7)&0xF)*8+64)(%esp), %mm0; movq (((19 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((19 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((19)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+19*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (((20 -16)&0xF)*8+64)(%esp), %mm0; paddq (((20 - 7)&0xF)*8+64)(%esp), %mm0; movq (((20 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((20 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((20)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+20*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (((21 -16)&0xF)*8+64)(%esp), %mm0; paddq (((21 - 7)&0xF)*8+64)(%esp), %mm0; movq (((21 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((21 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((21)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+21*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (((22 -16)&0xF)*8+64)(%esp), %mm0; paddq (((22 - 7)&0xF)*8+64)(%esp), %mm0; movq (((22 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((22 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((22)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+22*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (((23 -16)&0xF)*8+64)(%esp), %mm0; paddq (((23 - 7)&0xF)*8+64)(%esp), %mm0; movq (((23 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((23 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((23)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+23*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (((24 -16)&0xF)*8+64)(%esp), %mm0; paddq (((24 - 7)&0xF)*8+64)(%esp), %mm0; movq (((24 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((24 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((24)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+24*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (((25 -16)&0xF)*8+64)(%esp), %mm0; paddq (((25 - 7)&0xF)*8+64)(%esp), %mm0; movq (((25 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((25 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((25)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+25*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (((26 -16)&0xF)*8+64)(%esp), %mm0; paddq (((26 - 7)&0xF)*8+64)(%esp), %mm0; movq (((26 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((26 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((26)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+26*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (((27 -16)&0xF)*8+64)(%esp), %mm0; paddq (((27 - 7)&0xF)*8+64)(%esp), %mm0; movq (((27 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((27 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((27)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+27*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (((28 -16)&0xF)*8+64)(%esp), %mm0; paddq (((28 - 7)&0xF)*8+64)(%esp), %mm0; movq (((28 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((28 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((28)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+28*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (((29 -16)&0xF)*8+64)(%esp), %mm0; paddq (((29 - 7)&0xF)*8+64)(%esp), %mm0; movq (((29 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((29 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((29)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+29*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (((30 -16)&0xF)*8+64)(%esp), %mm0; paddq (((30 - 7)&0xF)*8+64)(%esp), %mm0; movq (((30 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((30 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((30)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+30*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (((31 -16)&0xF)*8+64)(%esp), %mm0; paddq (((31 - 7)&0xF)*8+64)(%esp), %mm0; movq (((31 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((31 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((31)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+31*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (((32 -16)&0xF)*8+64)(%esp), %mm0; paddq (((32 - 7)&0xF)*8+64)(%esp), %mm0; movq (((32 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((32 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((32)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+32*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (((33 -16)&0xF)*8+64)(%esp), %mm0; paddq (((33 - 7)&0xF)*8+64)(%esp), %mm0; movq (((33 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((33 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((33)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+33*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (((34 -16)&0xF)*8+64)(%esp), %mm0; paddq (((34 - 7)&0xF)*8+64)(%esp), %mm0; movq (((34 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((34 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((34)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+34*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (((35 -16)&0xF)*8+64)(%esp), %mm0; paddq (((35 - 7)&0xF)*8+64)(%esp), %mm0; movq (((35 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((35 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((35)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+35*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (((36 -16)&0xF)*8+64)(%esp), %mm0; paddq (((36 - 7)&0xF)*8+64)(%esp), %mm0; movq (((36 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((36 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((36)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+36*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (((37 -16)&0xF)*8+64)(%esp), %mm0; paddq (((37 - 7)&0xF)*8+64)(%esp), %mm0; movq (((37 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((37 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((37)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+37*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (((38 -16)&0xF)*8+64)(%esp), %mm0; paddq (((38 - 7)&0xF)*8+64)(%esp), %mm0; movq (((38 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((38 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((38)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+38*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (((39 -16)&0xF)*8+64)(%esp), %mm0; paddq (((39 - 7)&0xF)*8+64)(%esp), %mm0; movq (((39 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((39 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((39)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+39*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (((40 -16)&0xF)*8+64)(%esp), %mm0; paddq (((40 - 7)&0xF)*8+64)(%esp), %mm0; movq (((40 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((40 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((40)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+40*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (((41 -16)&0xF)*8+64)(%esp), %mm0; paddq (((41 - 7)&0xF)*8+64)(%esp), %mm0; movq (((41 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((41 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((41)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+41*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (((42 -16)&0xF)*8+64)(%esp), %mm0; paddq (((42 - 7)&0xF)*8+64)(%esp), %mm0; movq (((42 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((42 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((42)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+42*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (((43 -16)&0xF)*8+64)(%esp), %mm0; paddq (((43 - 7)&0xF)*8+64)(%esp), %mm0; movq (((43 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((43 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((43)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+43*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (((44 -16)&0xF)*8+64)(%esp), %mm0; paddq (((44 - 7)&0xF)*8+64)(%esp), %mm0; movq (((44 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((44 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((44)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+44*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (((45 -16)&0xF)*8+64)(%esp), %mm0; paddq (((45 - 7)&0xF)*8+64)(%esp), %mm0; movq (((45 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((45 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((45)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+45*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (((46 -16)&0xF)*8+64)(%esp), %mm0; paddq (((46 - 7)&0xF)*8+64)(%esp), %mm0; movq (((46 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((46 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((46)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+46*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (((47 -16)&0xF)*8+64)(%esp), %mm0; paddq (((47 - 7)&0xF)*8+64)(%esp), %mm0; movq (((47 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((47 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((47)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+47*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (((48 -16)&0xF)*8+64)(%esp), %mm0; paddq (((48 - 7)&0xF)*8+64)(%esp), %mm0; movq (((48 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((48 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((48)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+48*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (((49 -16)&0xF)*8+64)(%esp), %mm0; paddq (((49 - 7)&0xF)*8+64)(%esp), %mm0; movq (((49 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((49 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((49)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+49*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (((50 -16)&0xF)*8+64)(%esp), %mm0; paddq (((50 - 7)&0xF)*8+64)(%esp), %mm0; movq (((50 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((50 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((50)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+50*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (((51 -16)&0xF)*8+64)(%esp), %mm0; paddq (((51 - 7)&0xF)*8+64)(%esp), %mm0; movq (((51 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((51 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((51)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+51*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (((52 -16)&0xF)*8+64)(%esp), %mm0; paddq (((52 - 7)&0xF)*8+64)(%esp), %mm0; movq (((52 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((52 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((52)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+52*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (((53 -16)&0xF)*8+64)(%esp), %mm0; paddq (((53 - 7)&0xF)*8+64)(%esp), %mm0; movq (((53 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((53 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((53)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+53*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (((54 -16)&0xF)*8+64)(%esp), %mm0; paddq (((54 - 7)&0xF)*8+64)(%esp), %mm0; movq (((54 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((54 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((54)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+54*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (((55 -16)&0xF)*8+64)(%esp), %mm0; paddq (((55 - 7)&0xF)*8+64)(%esp), %mm0; movq (((55 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((55 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((55)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+55*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (((56 -16)&0xF)*8+64)(%esp), %mm0; paddq (((56 - 7)&0xF)*8+64)(%esp), %mm0; movq (((56 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((56 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((56)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+56*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (((57 -16)&0xF)*8+64)(%esp), %mm0; paddq (((57 - 7)&0xF)*8+64)(%esp), %mm0; movq (((57 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((57 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((57)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+57*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (((58 -16)&0xF)*8+64)(%esp), %mm0; paddq (((58 - 7)&0xF)*8+64)(%esp), %mm0; movq (((58 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((58 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((58)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+58*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (((59 -16)&0xF)*8+64)(%esp), %mm0; paddq (((59 - 7)&0xF)*8+64)(%esp), %mm0; movq (((59 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((59 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((59)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+59*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (((60 -16)&0xF)*8+64)(%esp), %mm0; paddq (((60 - 7)&0xF)*8+64)(%esp), %mm0; movq (((60 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((60 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((60)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+60*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (((61 -16)&0xF)*8+64)(%esp), %mm0; paddq (((61 - 7)&0xF)*8+64)(%esp), %mm0; movq (((61 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((61 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((61)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+61*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (((62 -16)&0xF)*8+64)(%esp), %mm0; paddq (((62 - 7)&0xF)*8+64)(%esp), %mm0; movq (((62 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((62 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((62)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+62*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (((63 -16)&0xF)*8+64)(%esp), %mm0; paddq (((63 - 7)&0xF)*8+64)(%esp), %mm0; movq (((63 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((63 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((63)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+63*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (((64 -16)&0xF)*8+64)(%esp), %mm0; paddq (((64 - 7)&0xF)*8+64)(%esp), %mm0; movq (((64 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((64 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((64)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+64*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (((65 -16)&0xF)*8+64)(%esp), %mm0; paddq (((65 - 7)&0xF)*8+64)(%esp), %mm0; movq (((65 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((65 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((65)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+65*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (((66 -16)&0xF)*8+64)(%esp), %mm0; paddq (((66 - 7)&0xF)*8+64)(%esp), %mm0; movq (((66 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((66 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((66)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+66*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (((67 -16)&0xF)*8+64)(%esp), %mm0; paddq (((67 - 7)&0xF)*8+64)(%esp), %mm0; movq (((67 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((67 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((67)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+67*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (((68 -16)&0xF)*8+64)(%esp), %mm0; paddq (((68 - 7)&0xF)*8+64)(%esp), %mm0; movq (((68 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((68 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((68)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+68*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (((69 -16)&0xF)*8+64)(%esp), %mm0; paddq (((69 - 7)&0xF)*8+64)(%esp), %mm0; movq (((69 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((69 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((69)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+69*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (((70 -16)&0xF)*8+64)(%esp), %mm0; paddq (((70 - 7)&0xF)*8+64)(%esp), %mm0; movq (((70 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((70 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((70)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+70*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (((71 -16)&0xF)*8+64)(%esp), %mm0; paddq (((71 - 7)&0xF)*8+64)(%esp), %mm0; movq (((71 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((71 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((71)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+71*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+ movq (((72 -16)&0xF)*8+64)(%esp), %mm0; paddq (((72 - 7)&0xF)*8+64)(%esp), %mm0; movq (((72 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((72 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((72)&0xF)*8+64)(%esp); paddq (7*8)(%esp), %mm0; movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+72*8, %mm0; movq (6*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (3*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (3*8)(%esp); movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (2*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (1*8)(%esp), %mm3; pand (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (7*8)(%esp);
+ movq (((73 -16)&0xF)*8+64)(%esp), %mm0; paddq (((73 - 7)&0xF)*8+64)(%esp), %mm0; movq (((73 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((73 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((73)&0xF)*8+64)(%esp); paddq (6*8)(%esp), %mm0; movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+73*8, %mm0; movq (5*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm2; pxor (5*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (2*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (2*8)(%esp); movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (1*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (0*8)(%esp), %mm3; pand (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (6*8)(%esp);
+ movq (((74 -16)&0xF)*8+64)(%esp), %mm0; paddq (((74 - 7)&0xF)*8+64)(%esp), %mm0; movq (((74 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((74 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((74)&0xF)*8+64)(%esp); paddq (5*8)(%esp), %mm0; movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+74*8, %mm0; movq (4*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm2; pxor (4*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (1*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (1*8)(%esp); movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (0*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (7*8)(%esp), %mm3; pand (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (5*8)(%esp);
+ movq (((75 -16)&0xF)*8+64)(%esp), %mm0; paddq (((75 - 7)&0xF)*8+64)(%esp), %mm0; movq (((75 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((75 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((75)&0xF)*8+64)(%esp); paddq (4*8)(%esp), %mm0; movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+75*8, %mm0; movq (3*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm2; pxor (3*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (0*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (0*8)(%esp); movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (7*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (6*8)(%esp), %mm3; pand (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (4*8)(%esp);
+ movq (((76 -16)&0xF)*8+64)(%esp), %mm0; paddq (((76 - 7)&0xF)*8+64)(%esp), %mm0; movq (((76 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((76 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((76)&0xF)*8+64)(%esp); paddq (3*8)(%esp), %mm0; movq (0*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+76*8, %mm0; movq (2*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; pand (0*8)(%esp), %mm2; pxor (2*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (7*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (7*8)(%esp); movq (4*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (6*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (5*8)(%esp), %mm3; pand (5*8)(%esp), %mm2; pand (4*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (3*8)(%esp);
+ movq (((77 -16)&0xF)*8+64)(%esp), %mm0; paddq (((77 - 7)&0xF)*8+64)(%esp), %mm0; movq (((77 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((77 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((77)&0xF)*8+64)(%esp); paddq (2*8)(%esp), %mm0; movq (7*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+77*8, %mm0; movq (1*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; pand (7*8)(%esp), %mm2; pxor (1*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (6*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (6*8)(%esp); movq (3*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (5*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (4*8)(%esp), %mm3; pand (4*8)(%esp), %mm2; pand (3*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (2*8)(%esp);
+ movq (((78 -16)&0xF)*8+64)(%esp), %mm0; paddq (((78 - 7)&0xF)*8+64)(%esp), %mm0; movq (((78 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((78 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((78)&0xF)*8+64)(%esp); paddq (1*8)(%esp), %mm0; movq (6*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+78*8, %mm0; movq (0*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; pand (6*8)(%esp), %mm2; pxor (0*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (5*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (5*8)(%esp); movq (2*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (4*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (3*8)(%esp), %mm3; pand (3*8)(%esp), %mm2; pand (2*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (1*8)(%esp);
+ movq (((79 -16)&0xF)*8+64)(%esp), %mm0; paddq (((79 - 7)&0xF)*8+64)(%esp), %mm0; movq (((79 -15)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-1), %mm5; psrlq $1, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-8), %mm4; psrlq $8, %mm2; por %mm4, %mm2; psrlq $7, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq (((79 - 2)&0xF)*8+64)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm5; psllq $(64-19), %mm5; psrlq $19, %mm1; por %mm5, %mm1; movq %mm2, %mm4; psllq $(64-61), %mm4; psrlq $61, %mm2; por %mm4, %mm2; psrlq $6, %mm3; pxor %mm3, %mm2; pxor %mm2, %mm1; paddq %mm1, %mm0; movq %mm0, (((79)&0xF)*8+64)(%esp); paddq (0*8)(%esp), %mm0; movq (5*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-18), %mm4; psrlq $18, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-41), %mm5; psrlq $41, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-14), %mm6; psrlq $14, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; paddq .roundconstants+79*8, %mm0; movq (7*8)(%esp), %mm2; pxor (6*8)(%esp), %mm2; pand (5*8)(%esp), %mm2; pxor (7*8)(%esp), %mm2; paddq %mm1, %mm0; paddq %mm2, %mm0; movq (4*8)(%esp), %mm1; paddq %mm0, %mm1; movq %mm1, (4*8)(%esp); movq (1*8)(%esp), %mm1; movq %mm1, %mm2; movq %mm1, %mm3; movq %mm1, %mm4; psllq $(64-39), %mm4; psrlq $39, %mm1; por %mm4, %mm1; movq %mm2, %mm5; psllq $(64-34), %mm5; psrlq $34, %mm2; por %mm5, %mm2; movq %mm3, %mm6; psllq $(64-28), %mm6; psrlq $28, %mm3; por %mm6, %mm3; pxor %mm2, %mm1; pxor %mm3, %mm1; movq (3*8)(%esp), %mm2; paddq %mm1, %mm0; movq %mm2, %mm3; por (2*8)(%esp), %mm3; pand (2*8)(%esp), %mm2; pand (1*8)(%esp), %mm3; por %mm2, %mm3; paddq %mm3, %mm0; movq %mm0, (0*8)(%esp);
+
+
+ movl 4(%ecx), %eax
+ movdqu 0(%eax), %xmm0; paddq 0(%esp), %xmm0; movdqu %xmm0, 0(%eax)
+ movdqu 16(%eax), %xmm1; paddq 16(%esp), %xmm1; movdqu %xmm1, 16(%eax)
+ movdqu 32(%eax), %xmm2; paddq 32(%esp), %xmm2; movdqu %xmm2, 32(%eax)
+ movdqu 48(%eax), %xmm3; paddq 48(%esp), %xmm3; movdqu %xmm3, 48(%eax)
+
+
+ emms
+ movl %ecx, %esp
+
+ .ifdef MS_STDCALL
+ ret $8
+ .else
+ retl
+ .endif
+
+
+.balign 8
+.bswap64:
+ .quad 0x0001020304050607
+
+.roundconstants:
+ .quad 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC
+ .quad 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118
+ .quad 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2
+ .quad 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694
+ .quad 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65
+ .quad 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5
+ .quad 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4
+ .quad 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70
+ .quad 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF
+ .quad 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B
+ .quad 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30
+ .quad 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8
+ .quad 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8
+ .quad 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3
+ .quad 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC
+ .quad 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B
+ .quad 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178
+ .quad 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B
+ .quad 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C
+ .quad 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
+
+ .ifndef WINABI
+#if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+ .endif \ No newline at end of file
diff --git a/src/Crypto/sha512_avx1_x64.asm b/src/Crypto/sha512_avx1_x64.asm
new file mode 100644
index 00000000..06321b5b
--- /dev/null
+++ b/src/Crypto/sha512_avx1_x64.asm
@@ -0,0 +1,427 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows: yasm -f x64 -D WINABI sha512_avx.asm
+; Linux: yasm -f elf64 sha512_avx.asm
+;
+
+BITS 64
+section .text
+
+; Virtual Registers
+%ifdef WINABI
+ %define msg rcx ; ARG1
+ %define digest rdx ; ARG2
+ %define msglen r8 ; ARG3
+ %define T1 rsi
+ %define T2 rdi
+%else
+ %define msg rdi ; ARG1
+ %define digest rsi ; ARG2
+ %define msglen rdx ; ARG3
+ %define T1 rcx
+ %define T2 r8
+%endif
+%define a_64 r9
+%define b_64 r10
+%define c_64 r11
+%define d_64 r12
+%define e_64 r13
+%define f_64 r14
+%define g_64 r15
+%define h_64 rbx
+%define tmp0 rax
+
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
+struc frame
+ .W: resq 80 ; Message Schedule
+ .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1]
+
+%ifdef WINABI
+ .XMMSAVE: resdq 4
+ .GPRSAVE: resq 7
+%else
+ .GPRSAVE: resq 5
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references
+%define MSG(i) msg + 8*(i) ; Input message (arg1)
+%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
+%define K_t(i) K512 + 8*(i) wrt rip ; SHA Constants (static mem)
+%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame)
+%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays
+; WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+
+%macro RotateState 0
+ ; Rotate symbles a..h right
+ %xdefine %%TMP h_64
+ %xdefine h_64 g_64
+ %xdefine g_64 f_64
+ %xdefine f_64 e_64
+ %xdefine e_64 d_64
+ %xdefine d_64 c_64
+ %xdefine c_64 b_64
+ %xdefine b_64 a_64
+ %xdefine a_64 %%TMP
+%endmacro
+
+%macro RORQ 2
+ ; shld is faster than ror on Sandybridge
+ shld %1, %1, (64 - %2)
+%endmacro
+
+%macro SHA512_Round 1
+%assign %%t (%1)
+
+ ; Compute Round %%t
+ mov T1, f_64 ; T1 = f
+ mov tmp0, e_64 ; tmp = e
+ xor T1, g_64 ; T1 = f ^ g
+ RORQ tmp0, 23 ; 41 ; tmp = e ror 23
+ and T1, e_64 ; T1 = (f ^ g) & e
+ xor tmp0, e_64 ; tmp = (e ror 23) ^ e
+ xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler
+ RORQ tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
+ xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov T2, a_64 ; T2 = a
+ add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
+ RORQ tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov tmp0, a_64 ; tmp = a
+ xor T2, c_64 ; T2 = a ^ c
+ and tmp0, c_64 ; tmp = a & c
+ and T2, b_64 ; T2 = (a ^ c) & b
+ xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov tmp0, a_64 ; tmp = a
+ RORQ tmp0, 5 ; 39 ; tmp = a ror 5
+ xor tmp0, a_64 ; tmp = (a ror 5) ^ a
+ add d_64, T1 ; e(next_state) = d + T1
+ RORQ tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
+ xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
+ RORQ tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+%endmacro
+
+%macro SHA512_2Sched_2Round_avx 1
+%assign %%t %1
+ ; Compute rounds %%t-2 and %%t-1
+ ; Compute message schedule QWORDS %%t and %%t+1
+
+ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ ; scheduler.
+ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ ; They are then added to their respective SHA512 constants at
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+ ; For brievity, the comments following vectored instructions only refer to
+ ; the first of a pair of QWORDS.
+ ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+ ; The computation of the message schedule and the rounds are tightly
+ ; stitched to take advantage of instruction-level parallelism.
+ ; For clarity, integer instructions (for the rounds calculation) are indented
+ ; by one tab. Vectored instructions (for the message scheduler) are indented
+ ; by two tabs.
+
+ vmovdqa xmm4, [W_t(%%t-2)] ; XMM4 = W[t-2]
+ vmovdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
+ mov T1, f_64
+ vpsrlq xmm0, xmm4, 61 ; XMM0 = W[t-2]>>61
+ mov tmp0, e_64
+ vpsrlq xmm6, xmm5, 1 ; XMM6 = W[t-15]>>1
+ xor T1, g_64
+ RORQ tmp0, 23 ; 41
+ vpsrlq xmm1, xmm4, 19 ; XMM1 = W[t-2]>>19
+ and T1, e_64
+ xor tmp0, e_64
+ vpxor xmm0, xmm1 ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+ xor T1, g_64
+ add T1, [WK_2(%%t)];
+ vpsrlq xmm7, xmm5, 8 ; XMM7 = W[t-15]>>8
+ RORQ tmp0, 4 ; 18
+ vpsrlq xmm2, xmm4, 6 ; XMM2 = W[t-2]>>6
+ xor tmp0, e_64
+ mov T2, a_64
+ add T1, h_64
+ vpxor xmm6, xmm7 ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+ RORQ tmp0, 14 ; 14
+ add T1, tmp0
+ vpsrlq xmm8, xmm5, 7 ; XMM8 = W[t-15]>>7
+ mov tmp0, a_64
+ xor T2, c_64
+ vpsllq xmm3, xmm4, (64-61) ; XMM3 = W[t-2]<<3
+ and tmp0, c_64
+ and T2, b_64
+ vpxor xmm2, xmm3 ; XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+ xor T2, tmp0
+ mov tmp0, a_64
+ vpsllq xmm9, xmm5, (64-1) ; XMM9 = W[t-15]<<63
+ RORQ tmp0, 5 ; 39
+ vpxor xmm8, xmm9 ; XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+ xor tmp0, a_64
+ add d_64, T1
+ RORQ tmp0, 6 ; 34
+ xor tmp0, a_64
+ vpxor xmm6, xmm8 ; XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63
+ lea h_64, [T1 + T2]
+ RORQ tmp0, 28 ; 28
+ vpsllq xmm4, (64-19) ; XMM4 = W[t-2]<<25
+ add h_64, tmp0
+ RotateState
+ vpxor xmm0, xmm4 ; XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25
+ mov T1, f_64
+ vpxor xmm0, xmm2 ; XMM0 = s1(W[t-2])
+ mov tmp0, e_64
+ xor T1, g_64
+ vpaddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + W[t-16]
+ vmovdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
+ RORQ tmp0, 23 ; 41
+ and T1, e_64
+ xor tmp0, e_64
+ xor T1, g_64
+ vpsllq xmm5, (64-8) ; XMM5 = W[t-15]<<56
+ add T1, [WK_2(%%t+1)]
+ vpxor xmm6, xmm5 ; XMM6 = s0(W[t-15])
+ RORQ tmp0, 4 ; 18
+ vpaddq xmm0, xmm6 ; XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+ xor tmp0, e_64
+ vpaddq xmm0, xmm1 ; XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ mov T2, a_64
+ add T1, h_64
+ RORQ tmp0, 14 ; 14
+ add T1, tmp0
+ vmovdqa [W_t(%%t)], xmm0 ; Store W[t]
+ vpaddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ vmovdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds
+ mov tmp0, a_64
+ xor T2, c_64
+ and tmp0, c_64
+ and T2, b_64
+ xor T2, tmp0
+ mov tmp0, a_64
+ RORQ tmp0, 5 ; 39
+ xor tmp0, a_64
+ add d_64, T1
+ RORQ tmp0, 6 ; 34
+ xor tmp0, a_64
+ lea h_64, [T1 + T2]
+ RORQ tmp0, 28 ; 28
+ add h_64, tmp0
+ RotateState
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_avx(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+; message blocks.
+; L is the message length in SHA512 blocks
+global sha512_avx:function
+sha512_avx:
+ cmp msglen, 0
+ je .nowork
+
+ ; Allocate Stack Space
+ sub rsp, frame_size
+
+ ; Save GPRs
+ mov [rsp + frame.GPRSAVE + 8 * 0], rbx
+ mov [rsp + frame.GPRSAVE + 8 * 1], r12
+ mov [rsp + frame.GPRSAVE + 8 * 2], r13
+ mov [rsp + frame.GPRSAVE + 8 * 3], r14
+ mov [rsp + frame.GPRSAVE + 8 * 4], r15
+%ifdef WINABI
+ mov [rsp + frame.GPRSAVE + 8 * 5], rsi
+ mov [rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+ ; Save XMMs
+%ifdef WINABI
+ vmovdqa [rsp + frame.XMMSAVE + 16 * 0], xmm6
+ vmovdqa [rsp + frame.XMMSAVE + 16 * 1], xmm7
+ vmovdqa [rsp + frame.XMMSAVE + 16 * 2], xmm8
+ vmovdqa [rsp + frame.XMMSAVE + 16 * 3], xmm9
+%endif
+
+.updateblock:
+
+ ; Load state variables
+ mov a_64, [DIGEST(0)]
+ mov b_64, [DIGEST(1)]
+ mov c_64, [DIGEST(2)]
+ mov d_64, [DIGEST(3)]
+ mov e_64, [DIGEST(4)]
+ mov f_64, [DIGEST(5)]
+ mov g_64, [DIGEST(6)]
+ mov h_64, [DIGEST(7)]
+
+ %assign t 0
+ %rep 80/2 + 1
+ ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ ; +1 iteration because the scheduler leads hashing by 1 iteration
+ %if t < 2
+ ; BSWAP 2 QWORDS
+ vmovdqa xmm1, [XMM_QWORD_BSWAP wrt rip]
+ vmovdqu xmm0, [MSG(t)]
+ vpshufb xmm0, xmm0, xmm1 ; BSWAP
+ vmovdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ vpaddq xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ vmovdqa [WK_2(t)], xmm0 ; Store into WK for rounds
+ %elif t < 16
+ ; BSWAP 2 QWORDS, Compute 2 Rounds
+ vmovdqu xmm0, [MSG(t)]
+ vpshufb xmm0, xmm0, xmm1 ; BSWAP
+ SHA512_Round t - 2 ; Round t-2
+ vmovdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ vpaddq xmm0, xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ SHA512_Round t - 1 ; Round t-1
+ vmovdqa [WK_2(t)], xmm0 ; W[t]+K[t] into WK
+ %elif t < 79
+ ; Schedule 2 QWORDS; Compute 2 Rounds
+ SHA512_2Sched_2Round_avx t
+ %else
+ ; Compute 2 Rounds
+ SHA512_Round t - 2
+ SHA512_Round t - 1
+ %endif
+ %assign t t+2
+ %endrep
+
+ ; Update digest
+ add [DIGEST(0)], a_64
+ add [DIGEST(1)], b_64
+ add [DIGEST(2)], c_64
+ add [DIGEST(3)], d_64
+ add [DIGEST(4)], e_64
+ add [DIGEST(5)], f_64
+ add [DIGEST(6)], g_64
+ add [DIGEST(7)], h_64
+
+ ; Advance to next message block
+ add msg, 16*8
+ dec msglen
+ jnz .updateblock
+
+ ; Restore XMMs
+%ifdef WINABI
+ vmovdqa xmm6, [rsp + frame.XMMSAVE + 16 * 0]
+ vmovdqa xmm7, [rsp + frame.XMMSAVE + 16 * 1]
+ vmovdqa xmm8, [rsp + frame.XMMSAVE + 16 * 2]
+ vmovdqa xmm9, [rsp + frame.XMMSAVE + 16 * 3]
+%endif
+ ; Restore GPRs
+ mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
+ mov r12, [rsp + frame.GPRSAVE + 8 * 1]
+ mov r13, [rsp + frame.GPRSAVE + 8 * 2]
+ mov r14, [rsp + frame.GPRSAVE + 8 * 3]
+ mov r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifdef WINABI
+ mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
+ mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+ ; Restore Stack Pointer
+ add rsp, frame_size
+
+.nowork:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ ddq 0x08090a0b0c0d0e0f0001020304050607
+
+; K[t] used in SHA512 hashing
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha512_avx1_x86.asm b/src/Crypto/sha512_avx1_x86.asm
new file mode 100644
index 00000000..31c8bd0d
--- /dev/null
+++ b/src/Crypto/sha512_avx1_x86.asm
@@ -0,0 +1,10 @@
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha512_avx2_x64.asm b/src/Crypto/sha512_avx2_x64.asm
new file mode 100644
index 00000000..1ba08665
--- /dev/null
+++ b/src/Crypto/sha512_avx2_x64.asm
@@ -0,0 +1,804 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows: yasm -f x64 -D WINABI sha512_rorx.asm
+; Linux: yasm -f elf64 sha512_rorx.asm
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+BITS 64
+section .text
+
+; Virtual Registers
+%define Y_0 ymm4
+%define Y_1 ymm5
+%define Y_2 ymm6
+%define Y_3 ymm7
+
+%define YTMP0 ymm0
+%define YTMP1 ymm1
+%define YTMP2 ymm2
+%define YTMP3 ymm3
+%define YTMP4 ymm8
+%define XFER YTMP0
+
+%define BYTE_FLIP_MASK ymm9
+
+%ifdef WINABI
+ %define INP rcx ; 1st arg
+ %define CTX rdx ; 2nd arg
+ %define NUM_BLKS r8 ; 3rd arg
+ %define c rdi
+ %define d rsi
+ %define e r8
+ %define y3 rcx
+%else
+ %define INP rdi ; 1st arg
+ %define CTX rsi ; 2nd arg
+ %define NUM_BLKS rdx ; 3rd arg
+ %define c rcx
+ %define d r8
+ %define e rdx
+ %define y3 rdi
+%endif
+
+%define TBL rbp
+
+%define a rax
+%define b rbx
+
+%define f r9
+%define g r10
+%define h r11
+%define old_h r11
+
+%define T1 r12
+%define y0 r13
+%define y1 r14
+%define y2 r15
+
+%define y4 r12
+
+; Local variables (stack frame)
+struc frame
+ .XFER: resq 4
+ .SRND: resq 1
+ .INP: resq 1
+ .INPEND: resq 1
+ .RSPSAVE: resq 1
+
+%ifdef WINABI
+ .XMMSAVE: resdq 4
+ .GPRSAVE: resq 8
+%else
+ .GPRSAVE: resq 6
+%endif
+endstruc
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+; addm [mem], reg
+; Add reg to mem using reg-mem add and store
+%macro addm 2
+ add %2, %1
+ mov %1, %2
+%endm
+
+
+; COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+; Load ymm with mem and byte swap each dword
+%macro COPY_YMM_AND_BSWAP 3
+ VMOVDQ %1, %2
+ vpshufb %1, %1 ,%3
+%endmacro
+; rotate_Ys
+; Rotate values of symbols Y0...Y3
+%macro rotate_Ys 0
+ %xdefine %%Y_ Y_0
+ %xdefine Y_0 Y_1
+ %xdefine Y_1 Y_2
+ %xdefine Y_2 Y_3
+ %xdefine Y_3 %%Y_
+%endm
+
+; RotateState
+%macro RotateState 0
+ ; Rotate symbles a..h right
+ %xdefine old_h h
+ %xdefine %%TMP_ h
+ %xdefine h g
+ %xdefine g f
+ %xdefine f e
+ %xdefine e d
+ %xdefine d c
+ %xdefine c b
+ %xdefine b a
+ %xdefine a %%TMP_
+%endm
+
+; %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
+; YDST = {YSRC1, YSRC2} >> RVAL*8
+%macro MY_VPALIGNR 4
+%define %%YDST %1
+%define %%YSRC1 %2
+%define %%YSRC2 %3
+%define %%RVAL %4
+ vperm2f128 %%YDST, %%YSRC1, %%YSRC2, 0x3 ; YDST = {YS1_LO, YS2_HI}
+ vpalignr %%YDST, %%YDST, %%YSRC2, %%RVAL ; YDST = {YDS1, YS2} >> RVAL*8
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; Extract w[t-7]
+ MY_VPALIGNR YTMP0, Y_3, Y_2, 8 ; YTMP0 = W[-7]
+ ; Calculate w[t-16] + w[t-7]
+ vpaddq YTMP0, YTMP0, Y_0 ; YTMP0 = W[-7] + W[-16]
+ ; Extract w[t-15]
+ MY_VPALIGNR YTMP1, Y_1, Y_0, 8 ; YTMP1 = W[-15]
+
+ ; Calculate sigma0
+
+ ; Calculate w[t-15] ror 1
+ vpsrlq YTMP2, YTMP1, 1
+ vpsllq YTMP3, YTMP1, (64-1)
+ vpor YTMP3, YTMP3, YTMP2 ; YTMP3 = W[-15] ror 1
+ ; Calculate w[t-15] shr 7
+ vpsrlq YTMP4, YTMP1, 7 ; YTMP4 = W[-15] >> 7
+
+ mov y3, a ; y3 = a ; MAJA
+ rorx y0, e, 41 ; y0 = e >> 41 ; S1A
+ rorx y1, e, 18 ; y1 = e >> 18 ; S1B
+
+ add h, [rsp+frame.XFER+0*8] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+ mov y2, f ; y2 = f ; CH
+ rorx T1, a, 34 ; T1 = a >> 34 ; S0B
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1
+ xor y2, g ; y2 = f^g ; CH
+ rorx y1, e, 14 ; y1 = (e >> 14) ; S1
+
+ and y2, e ; y2 = (f^g)&e ; CH
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1
+ rorx y1, a, 39 ; y1 = a >> 39 ; S0A
+ add d, h ; d = k + w + h + d ; --
+
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0
+ rorx T1, a, 28 ; T1 = (a >> 28) ; S0
+
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and T1, c ; T1 = a&c ; MAJB
+
+ add y2, y0 ; y2 = S1 + CH ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+RotateState
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; Calculate w[t-15] ror 8
+ vpsrlq YTMP2, YTMP1, 8
+ vpsllq YTMP1, YTMP1, (64-8)
+ vpor YTMP1, YTMP1, YTMP2 ; YTMP1 = W[-15] ror 8
+ ; XOR the three components
+ vpxor YTMP3, YTMP3, YTMP4 ; YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+ vpxor YTMP1, YTMP3, YTMP1 ; YTMP1 = s0
+
+
+ ; Add three components, w[t-16], w[t-7] and sigma0
+ vpaddq YTMP0, YTMP0, YTMP1 ; YTMP0 = W[-16] + W[-7] + s0
+ ; Move to appropriate lanes for calculating w[16] and w[17]
+ vperm2f128 Y_0, YTMP0, YTMP0, 0x0 ; Y_0 = W[-16] + W[-7] + s0 {BABA}
+ ; Move to appropriate lanes for calculating w[18] and w[19]
+ vpand YTMP0, YTMP0, [MASK_YMM_LO wrt rip] ; YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+ ; Calculate w[16] and w[17] in both 128 bit lanes
+
+ ; Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+ vperm2f128 YTMP2, Y_3, Y_3, 0x11 ; YTMP2 = W[-2] {BABA}
+ vpsrlq YTMP4, YTMP2, 6 ; YTMP4 = W[-2] >> 6 {BABA}
+
+
+ mov y3, a ; y3 = a ; MAJA
+ rorx y0, e, 41 ; y0 = e >> 41 ; S1A
+ rorx y1, e, 18 ; y1 = e >> 18 ; S1B
+ add h, [rsp+frame.XFER+1*8] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+
+ mov y2, f ; y2 = f ; CH
+ rorx T1, a, 34 ; T1 = a >> 34 ; S0B
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1
+ xor y2, g ; y2 = f^g ; CH
+
+
+ rorx y1, e, 14 ; y1 = (e >> 14) ; S1
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1
+ rorx y1, a, 39 ; y1 = a >> 39 ; S0A
+ and y2, e ; y2 = (f^g)&e ; CH
+ add d, h ; d = k + w + h + d ; --
+
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0
+
+ rorx T1, a, 28 ; T1 = (a >> 28) ; S0
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+RotateState
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpsrlq YTMP3, YTMP2, 19 ; YTMP3 = W[-2] >> 19 {BABA}
+ vpsllq YTMP1, YTMP2, (64-19) ; YTMP1 = W[-2] << 19 {BABA}
+ vpor YTMP3, YTMP3, YTMP1 ; YTMP3 = W[-2] ror 19 {BABA}
+ vpxor YTMP4, YTMP4, YTMP3 ; YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+ vpsrlq YTMP3, YTMP2, 61 ; YTMP3 = W[-2] >> 61 {BABA}
+ vpsllq YTMP1, YTMP2, (64-61) ; YTMP1 = W[-2] << 61 {BABA}
+ vpor YTMP3, YTMP3, YTMP1 ; YTMP3 = W[-2] ror 61 {BABA}
+ vpxor YTMP4, YTMP4, YTMP3 ; YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+ ; Add sigma1 to the other compunents to get w[16] and w[17]
+ vpaddq Y_0, Y_0, YTMP4 ; Y_0 = {W[1], W[0], W[1], W[0]}
+
+ ; Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+ vpsrlq YTMP4, Y_0, 6 ; YTMP4 = W[-2] >> 6 {DC--}
+
+ mov y3, a ; y3 = a ; MAJA
+ rorx y0, e, 41 ; y0 = e >> 41 ; S1A
+ add h, [rsp+frame.XFER+2*8] ; h = k + w + h ; --
+
+ rorx y1, e, 18 ; y1 = e >> 18 ; S1B
+ or y3, c ; y3 = a|c ; MAJA
+ mov y2, f ; y2 = f ; CH
+ xor y2, g ; y2 = f^g ; CH
+
+ rorx T1, a, 34 ; T1 = a >> 34 ; S0B
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+
+ rorx y1, e, 14 ; y1 = (e >> 14) ; S1
+ add d, h ; d = k + w + h + d ; --
+ and y3, b ; y3 = (a|c)&b ; MAJA
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1
+ rorx y1, a, 39 ; y1 = a >> 39 ; S0A
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0
+ rorx T1, a, 28 ; T1 = (a >> 28) ; S0
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+RotateState
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpsrlq YTMP3, Y_0, 19 ; YTMP3 = W[-2] >> 19 {DC--}
+ vpsllq YTMP1, Y_0, (64-19) ; YTMP1 = W[-2] << 19 {DC--}
+ vpor YTMP3, YTMP3, YTMP1 ; YTMP3 = W[-2] ror 19 {DC--}
+ vpxor YTMP4, YTMP4, YTMP3 ; YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+ vpsrlq YTMP3, Y_0, 61 ; YTMP3 = W[-2] >> 61 {DC--}
+ vpsllq YTMP1, Y_0, (64-61) ; YTMP1 = W[-2] << 61 {DC--}
+ vpor YTMP3, YTMP3, YTMP1 ; YTMP3 = W[-2] ror 61 {DC--}
+ vpxor YTMP4, YTMP4, YTMP3 ; YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+ ; Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19]
+ vpaddq YTMP2, YTMP0, YTMP4 ; YTMP2 = {W[3], W[2], --, --}
+
+ ; Form w[19, w[18], w17], w[16]
+ vpblendd Y_0, Y_0, YTMP2, 0xF0 ; Y_0 = {W[3], W[2], W[1], W[0]}
+; vperm2f128 Y_0, Y_0, YTMP2, 0x30
+
+ mov y3, a ; y3 = a ; MAJA
+ rorx y0, e, 41 ; y0 = e >> 41 ; S1A
+ rorx y1, e, 18 ; y1 = e >> 18 ; S1B
+ add h, [rsp+frame.XFER+3*8] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+
+ mov y2, f ; y2 = f ; CH
+ rorx T1, a, 34 ; T1 = a >> 34 ; S0B
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1
+ xor y2, g ; y2 = f^g ; CH
+
+
+ rorx y1, e, 14 ; y1 = (e >> 14) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+ add d, h ; d = k + w + h + d ; --
+ and y3, b ; y3 = (a|c)&b ; MAJA
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+
+ rorx y1, a, 39 ; y1 = a >> 39 ; S0A
+ add y2, y0 ; y2 = S1 + CH ; --
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+ rorx T1, a, 28 ; T1 = (a >> 28) ; S0
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and T1, c ; T1 = a&c ; MAJB
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+
+ add h, y1 ; h = k + w + h + S0 ; --
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+RotateState
+
+rotate_Ys
+%endm
+
+%macro DO_4ROUNDS 0
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov y2, f ; y2 = f ; CH
+ rorx y0, e, 41 ; y0 = e >> 41 ; S1A
+ rorx y1, e, 18 ; y1 = e >> 18 ; S1B
+ xor y2, g ; y2 = f^g ; CH
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1
+ rorx y1, e, 14 ; y1 = (e >> 14) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1
+ rorx T1, a, 34 ; T1 = a >> 34 ; S0B
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ rorx y1, a, 39 ; y1 = a >> 39 ; S0A
+ mov y3, a ; y3 = a ; MAJA
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0
+ rorx T1, a, 28 ; T1 = (a >> 28) ; S0
+ add h, [rsp + frame.XFER + 8*0] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+
+ add d, h ; d = k + w + h + d ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+
+ ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+
+ ;add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ RotateState
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+ mov y2, f ; y2 = f ; CH
+ rorx y0, e, 41 ; y0 = e >> 41 ; S1A
+ rorx y1, e, 18 ; y1 = e >> 18 ; S1B
+ xor y2, g ; y2 = f^g ; CH
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1
+ rorx y1, e, 14 ; y1 = (e >> 14) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+ add old_h, y3 ; h = t1 + S0 + MAJ ; --
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1
+ rorx T1, a, 34 ; T1 = a >> 34 ; S0B
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ rorx y1, a, 39 ; y1 = a >> 39 ; S0A
+ mov y3, a ; y3 = a ; MAJA
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0
+ rorx T1, a, 28 ; T1 = (a >> 28) ; S0
+ add h, [rsp + frame.XFER + 8*1] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+
+ add d, h ; d = k + w + h + d ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+
+ ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+
+ ;add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ RotateState
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+ mov y2, f ; y2 = f ; CH
+ rorx y0, e, 41 ; y0 = e >> 41 ; S1A
+ rorx y1, e, 18 ; y1 = e >> 18 ; S1B
+ xor y2, g ; y2 = f^g ; CH
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1
+ rorx y1, e, 14 ; y1 = (e >> 14) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+ add old_h, y3 ; h = t1 + S0 + MAJ ; --
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1
+ rorx T1, a, 34 ; T1 = a >> 34 ; S0B
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ rorx y1, a, 39 ; y1 = a >> 39 ; S0A
+ mov y3, a ; y3 = a ; MAJA
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0
+ rorx T1, a, 28 ; T1 = (a >> 28) ; S0
+ add h, [rsp + frame.XFER + 8*2] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+
+ add d, h ; d = k + w + h + d ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+
+ ;add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+
+ ;add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ RotateState
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ add old_h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+ mov y2, f ; y2 = f ; CH
+ rorx y0, e, 41 ; y0 = e >> 41 ; S1A
+ rorx y1, e, 18 ; y1 = e >> 18 ; S1B
+ xor y2, g ; y2 = f^g ; CH
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ; S1
+ rorx y1, e, 14 ; y1 = (e >> 14) ; S1
+ and y2, e ; y2 = (f^g)&e ; CH
+ add old_h, y3 ; h = t1 + S0 + MAJ ; --
+
+ xor y0, y1 ; y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1
+ rorx T1, a, 34 ; T1 = a >> 34 ; S0B
+ xor y2, g ; y2 = CH = ((f^g)&e)^g ; CH
+ rorx y1, a, 39 ; y1 = a >> 39 ; S0A
+ mov y3, a ; y3 = a ; MAJA
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ; S0
+ rorx T1, a, 28 ; T1 = (a >> 28) ; S0
+ add h, [rsp + frame.XFER + 8*3] ; h = k + w + h ; --
+ or y3, c ; y3 = a|c ; MAJA
+
+ xor y1, T1 ; y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0
+ mov T1, a ; T1 = a ; MAJB
+ and y3, b ; y3 = (a|c)&b ; MAJA
+ and T1, c ; T1 = a&c ; MAJB
+ add y2, y0 ; y2 = S1 + CH ; --
+
+
+ add d, h ; d = k + w + h + d ; --
+ or y3, T1 ; y3 = MAJ = (a|c)&b)|(a&c) ; MAJ
+ add h, y1 ; h = k + w + h + S0 ; --
+
+ add d, y2 ; d = k + w + h + d + S1 + CH = d + t1 ; --
+
+
+ add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; --
+
+ add h, y3 ; h = t1 + S0 + MAJ ; --
+
+ RotateState
+
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_rorx(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+; message blocks.
+; L is the message length in SHA512 blocks
+global sha512_rorx:function
+global _sha512_rorx:function
+sha512_rorx:
+_sha512_rorx:
+
+ ; Allocate Stack Space
+ mov rax, rsp
+ sub rsp, frame_size
+ and rsp, ~(0x20 - 1)
+ mov [rsp + frame.RSPSAVE], rax
+
+ ; Save GPRs
+ mov [rsp + frame.GPRSAVE + 8 * 0], rbp
+ mov [rsp + frame.GPRSAVE + 8 * 1], rbx
+ mov [rsp + frame.GPRSAVE + 8 * 2], r12
+ mov [rsp + frame.GPRSAVE + 8 * 3], r13
+ mov [rsp + frame.GPRSAVE + 8 * 4], r14
+ mov [rsp + frame.GPRSAVE + 8 * 5], r15
+%ifdef WINABI
+ mov [rsp + frame.GPRSAVE + 8 * 6], rsi
+ mov [rsp + frame.GPRSAVE + 8 * 7], rdi
+%endif
+
+%ifdef WINABI
+ vmovdqa [rsp + frame.XMMSAVE + 0*16], xmm6
+ vmovdqa [rsp + frame.XMMSAVE + 1*16], xmm7
+ vmovdqa [rsp + frame.XMMSAVE + 2*16], xmm8
+ vmovdqa [rsp + frame.XMMSAVE + 3*16], xmm9
+%endif
+
+ vpblendd xmm0, xmm0, xmm1, 0xf0
+ vpblendd ymm0, ymm0, ymm1, 0xf0
+
+ shl NUM_BLKS, 7 ; convert to bytes
+ jz done_hash
+ add NUM_BLKS, INP ; pointer to end of data
+ mov [rsp + frame.INPEND], NUM_BLKS
+
+ ;; load initial digest
+ mov a,[8*0 + CTX]
+ mov b,[8*1 + CTX]
+ mov c,[8*2 + CTX]
+ mov d,[8*3 + CTX]
+ mov e,[8*4 + CTX]
+ mov f,[8*5 + CTX]
+ mov g,[8*6 + CTX]
+ mov h,[8*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+
+loop0:
+ lea TBL,[K512 wrt rip]
+
+ ;; byte swap first 16 dwords
+ COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK
+
+ mov [rsp + frame.INP], INP
+
+ ;; schedule 64 input dwords, by doing 12 rounds of 4 each
+ mov qword[rsp + frame.SRND],4
+
+align 16
+loop1:
+ vpaddq XFER, Y_0, [TBL + 0*32]
+ vmovdqa [rsp + frame.XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq XFER, Y_0, [TBL + 1*32]
+ vmovdqa [rsp + frame.XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq XFER, Y_0, [TBL + 2*32]
+ vmovdqa [rsp + frame.XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq XFER, Y_0, [TBL + 3*32]
+ vmovdqa [rsp + frame.XFER], XFER
+ add TBL, 4*32
+ FOUR_ROUNDS_AND_SCHED
+
+ sub qword[rsp + frame.SRND], 1
+ jne loop1
+
+ mov qword[rsp + frame.SRND], 2
+loop2:
+ vpaddq XFER, Y_0, [TBL + 0*32]
+ vmovdqa [rsp + frame.XFER], XFER
+ DO_4ROUNDS
+ vpaddq XFER, Y_1, [TBL + 1*32]
+ vmovdqa [rsp + frame.XFER], XFER
+ add TBL, 2*32
+ DO_4ROUNDS
+
+ vmovdqa Y_0, Y_2
+ vmovdqa Y_1, Y_3
+
+ sub qword[rsp + frame.SRND], 1
+ jne loop2
+
+ addm [8*0 + CTX],a
+ addm [8*1 + CTX],b
+ addm [8*2 + CTX],c
+ addm [8*3 + CTX],d
+ addm [8*4 + CTX],e
+ addm [8*5 + CTX],f
+ addm [8*6 + CTX],g
+ addm [8*7 + CTX],h
+
+ mov INP, [rsp + frame.INP]
+ add INP, 128
+ cmp INP, [rsp + frame.INPEND]
+ jne loop0
+
+ done_hash:
+%ifdef WINABI
+ vmovdqa xmm6, [rsp + frame.XMMSAVE + 0*16]
+ vmovdqa xmm7, [rsp + frame.XMMSAVE + 1*16]
+ vmovdqa xmm8, [rsp + frame.XMMSAVE + 2*16]
+ vmovdqa xmm9, [rsp + frame.XMMSAVE + 3*16]
+%endif
+
+; Restore GPRs
+ mov rbp, [rsp + frame.GPRSAVE + 8 * 0]
+ mov rbx, [rsp + frame.GPRSAVE + 8 * 1]
+ mov r12, [rsp + frame.GPRSAVE + 8 * 2]
+ mov r13, [rsp + frame.GPRSAVE + 8 * 3]
+ mov r14, [rsp + frame.GPRSAVE + 8 * 4]
+ mov r15, [rsp + frame.GPRSAVE + 8 * 5]
+%ifdef WINABI
+ mov rsi, [rsp + frame.GPRSAVE + 8 * 6]
+ mov rdi, [rsp + frame.GPRSAVE + 8 * 7]
+%endif
+ ; Restore Stack Pointer
+ mov rsp, [rsp + frame.RSPSAVE]
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+align 64
+; K[t] used in SHA512 hashing
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+align 32
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+PSHUFFLE_BYTE_FLIP_MASK: ddq 0x08090a0b0c0d0e0f0001020304050607
+ ddq 0x18191a1b1c1d1e1f1011121314151617
+
+MASK_YMM_LO: ddq 0x00000000000000000000000000000000
+ ddq 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha512_avx2_x86.asm b/src/Crypto/sha512_avx2_x86.asm
new file mode 100644
index 00000000..31c8bd0d
--- /dev/null
+++ b/src/Crypto/sha512_avx2_x86.asm
@@ -0,0 +1,10 @@
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha512_sse4_x64.asm b/src/Crypto/sha512_sse4_x64.asm
new file mode 100644
index 00000000..d4a99875
--- /dev/null
+++ b/src/Crypto/sha512_sse4_x64.asm
@@ -0,0 +1,416 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Example YASM command lines:
+; Windows: yasm -f x64 -D WINABI sha512_sse4.asm
+; Linux: yasm -f elf64 sha512_sse4.asm
+;
+
+# Modified by kerukuro for use in cppcrypto.
+
+BITS 64
+section .text
+
+; Virtual Registers
+%ifdef WINABI
+ %define msg rcx ; ARG1
+ %define digest rdx ; ARG2
+ %define msglen r8 ; ARG3
+ %define T1 rsi
+ %define T2 rdi
+%else
+ %define msg rdi ; ARG1
+ %define digest rsi ; ARG2
+ %define msglen rdx ; ARG3
+ %define T1 rcx
+ %define T2 r8
+%endif
+%define a_64 r9
+%define b_64 r10
+%define c_64 r11
+%define d_64 r12
+%define e_64 r13
+%define f_64 r14
+%define g_64 r15
+%define h_64 rbx
+%define tmp0 rax
+
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
+struc frame
+ .W: resq 80 ; Message Schedule
+ .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1]
+
+%ifdef WINABI
+ .GPRSAVE: resq 7
+%else
+ .GPRSAVE: resq 5
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references
+%define MSG(i) msg + 8*(i) ; Input message (arg1)
+%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
+%define K_t(i) K512 + 8*(i) wrt rip ; SHA Constants (static mem)
+%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame)
+%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays
+; WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+
+%macro RotateState 0
+ ; Rotate symbles a..h right
+ %xdefine %%TMP h_64
+ %xdefine h_64 g_64
+ %xdefine g_64 f_64
+ %xdefine f_64 e_64
+ %xdefine e_64 d_64
+ %xdefine d_64 c_64
+ %xdefine c_64 b_64
+ %xdefine b_64 a_64
+ %xdefine a_64 %%TMP
+%endmacro
+
+%macro SHA512_Round 1
+%assign %%t (%1)
+
+ ; Compute Round %%t
+ mov T1, f_64 ; T1 = f
+ mov tmp0, e_64 ; tmp = e
+ xor T1, g_64 ; T1 = f ^ g
+ ror tmp0, 23 ; 41 ; tmp = e ror 23
+ and T1, e_64 ; T1 = (f ^ g) & e
+ xor tmp0, e_64 ; tmp = (e ror 23) ^ e
+ xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler
+ ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
+ xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov T2, a_64 ; T2 = a
+ add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov tmp0, a_64 ; tmp = a
+ xor T2, c_64 ; T2 = a ^ c
+ and tmp0, c_64 ; tmp = a & c
+ and T2, b_64 ; T2 = (a ^ c) & b
+ xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov tmp0, a_64 ; tmp = a
+ ror tmp0, 5 ; 39 ; tmp = a ror 5
+ xor tmp0, a_64 ; tmp = (a ror 5) ^ a
+ add d_64, T1 ; e(next_state) = d + T1
+ ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
+ xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
+ ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+%endmacro
+
+%macro SHA512_2Sched_2Round_sse 1
+%assign %%t (%1)
+
+ ; Compute rounds %%t-2 and %%t-1
+ ; Compute message schedule QWORDS %%t and %%t+1
+
+ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ ; scheduler.
+ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ ; They are then added to their respective SHA512 constants at
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+ ; For brievity, the comments following vectored instructions only refer to
+ ; the first of a pair of QWORDS.
+ ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ ; The computation of the message schedule and the rounds are tightly
+ ; stitched to take advantage of instruction-level parallelism.
+ ; For clarity, integer instructions (for the rounds calculation) are indented
+ ; by one tab. Vectored instructions (for the message scheduler) are indented
+ ; by two tabs.
+
+ mov T1, f_64
+ movdqa xmm2, [W_t(%%t-2)] ; XMM2 = W[t-2]
+ xor T1, g_64
+ and T1, e_64
+ movdqa xmm0, xmm2 ; XMM0 = W[t-2]
+ xor T1, g_64
+ add T1, [WK_2(%%t)]
+ movdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
+ mov tmp0, e_64
+ ror tmp0, 23 ; 41
+ movdqa xmm3, xmm5 ; XMM3 = W[t-15]
+ xor tmp0, e_64
+ ror tmp0, 4 ; 18
+ psrlq xmm0, 61 - 19 ; XMM0 = W[t-2] >> 42
+ xor tmp0, e_64
+ ror tmp0, 14 ; 14
+ psrlq xmm3, (8 - 7) ; XMM3 = W[t-15] >> 1
+ add T1, tmp0
+ add T1, h_64
+ pxor xmm0, xmm2 ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov T2, a_64
+ xor T2, c_64
+ pxor xmm3, xmm5 ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and T2, b_64
+ mov tmp0, a_64
+ psrlq xmm0, 19 - 6 ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and tmp0, c_64
+ xor T2, tmp0
+ psrlq xmm3, (7 - 1) ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov tmp0, a_64
+ ror tmp0, 5 ; 39
+ pxor xmm0, xmm2 ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor tmp0, a_64
+ ror tmp0, 6 ; 34
+ pxor xmm3, xmm5 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor tmp0, a_64
+ ror tmp0, 28 ; 28
+ psrlq xmm0, 6 ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add T2, tmp0
+ add d_64, T1
+ psrlq xmm3, 1 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea h_64, [T1 + T2]
+ RotateState
+ movdqa xmm1, xmm2 ; XMM1 = W[t-2]
+ mov T1, f_64
+ xor T1, g_64
+ movdqa xmm4, xmm5 ; XMM4 = W[t-15]
+ and T1, e_64
+ xor T1, g_64
+ psllq xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
+ add T1, [WK_2(%%t+1)]
+ mov tmp0, e_64
+ psllq xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
+ ror tmp0, 23 ; 41
+ xor tmp0, e_64
+ pxor xmm1, xmm2 ; XMM1 = (W[t-2] << 42)^W[t-2]
+ ror tmp0, 4 ; 18
+ xor tmp0, e_64
+ pxor xmm4, xmm5 ; XMM4 = (W[t-15]<<7)^W[t-15]
+ ror tmp0, 14 ; 14
+ add T1, tmp0
+ psllq xmm1, (64 - 61) ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add T1, h_64
+ mov T2, a_64
+ psllq xmm4, (64 - 8) ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor T2, c_64
+ and T2, b_64
+ pxor xmm0, xmm1 ; XMM0 = s1(W[t-2])
+ mov tmp0, a_64
+ and tmp0, c_64
+ movdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
+ xor T2, tmp0
+ pxor xmm3, xmm4 ; XMM3 = s0(W[t-15])
+ mov tmp0, a_64
+ paddq xmm0, xmm3 ; XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror tmp0, 5 ; 39
+ paddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor tmp0, a_64
+ paddq xmm0, xmm1 ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror tmp0, 6 ; 34
+ movdqa [W_t(%%t)], xmm0 ; Store scheduled qwords
+ xor tmp0, a_64
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ ror tmp0, 28 ; 28
+ movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds
+ add T2, tmp0
+ add d_64, T1
+ lea h_64, [T1 + T2]
+ RotateState
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_sse4(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+; message blocks.
+; L is the message length in SHA512 blocks.
+global sha512_sse4:function
+global _sha512_sse4:function
+sha512_sse4:
+_sha512_sse4:
+ cmp msglen, 0
+ je .nowork
+
+ ; Allocate Stack Space
+ sub rsp, frame_size
+
+ ; Save GPRs
+ mov [rsp + frame.GPRSAVE + 8 * 0], rbx
+ mov [rsp + frame.GPRSAVE + 8 * 1], r12
+ mov [rsp + frame.GPRSAVE + 8 * 2], r13
+ mov [rsp + frame.GPRSAVE + 8 * 3], r14
+ mov [rsp + frame.GPRSAVE + 8 * 4], r15
+%ifdef WINABI
+ mov [rsp + frame.GPRSAVE + 8 * 5], rsi
+ mov [rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+
+.updateblock:
+
+ ; Load state variables
+ mov a_64, [DIGEST(0)]
+ mov b_64, [DIGEST(1)]
+ mov c_64, [DIGEST(2)]
+ mov d_64, [DIGEST(3)]
+ mov e_64, [DIGEST(4)]
+ mov f_64, [DIGEST(5)]
+ mov g_64, [DIGEST(6)]
+ mov h_64, [DIGEST(7)]
+
+ %assign t 0
+ %rep 80/2 + 1
+ ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ ; +1 iteration because the scheduler leads hashing by 1 iteration
+ %if t < 2
+ ; BSWAP 2 QWORDS
+ movdqa xmm1, [XMM_QWORD_BSWAP wrt rip]
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ movdqa [WK_2(t)], xmm0 ; Store into WK for rounds
+ %elif t < 16
+ ; BSWAP 2 QWORDS; Compute 2 Rounds
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ SHA512_Round t - 2 ; Round t-2
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ SHA512_Round t - 1 ; Round t-1
+ movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
+ %elif t < 79
+ ; Schedule 2 QWORDS; Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ %else
+ ; Compute 2 Rounds
+ SHA512_Round t - 2
+ SHA512_Round t - 1
+ %endif
+ %assign t t+2
+ %endrep
+
+ ; Update digest
+ add [DIGEST(0)], a_64
+ add [DIGEST(1)], b_64
+ add [DIGEST(2)], c_64
+ add [DIGEST(3)], d_64
+ add [DIGEST(4)], e_64
+ add [DIGEST(5)], f_64
+ add [DIGEST(6)], g_64
+ add [DIGEST(7)], h_64
+
+ ; Advance to next message block
+ add msg, 16*8
+ dec msglen
+ jnz .updateblock
+
+ ; Restore GPRs
+ mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
+ mov r12, [rsp + frame.GPRSAVE + 8 * 1]
+ mov r13, [rsp + frame.GPRSAVE + 8 * 2]
+ mov r14, [rsp + frame.GPRSAVE + 8 * 3]
+ mov r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifdef WINABI
+ mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
+ mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+ ; Restore Stack Pointer
+ add rsp, frame_size
+
+.nowork:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ ddq 0x08090a0b0c0d0e0f0001020304050607
+
+; K[t] used in SHA512 hashing
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Crypto/sha512_sse4_x86.asm b/src/Crypto/sha512_sse4_x86.asm
new file mode 100644
index 00000000..31c8bd0d
--- /dev/null
+++ b/src/Crypto/sha512_sse4_x86.asm
@@ -0,0 +1,10 @@
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/Driver/DriveFilter.c b/src/Driver/DriveFilter.c
index 8195fe35..d46bd92e 100644
--- a/src/Driver/DriveFilter.c
+++ b/src/Driver/DriveFilter.c
@@ -330,7 +330,7 @@ static void ComputeBootLoaderFingerprint(PDEVICE_OBJECT LowerDeviceObject, byte*
#if !defined (_WIN64)
KFLOATING_SAVE floatingPointState;
NTSTATUS saveStatus = STATUS_SUCCESS;
- if (HasISSE())
+ if (HasISSE()|| (HasSSE2() && HasMMX()))
saveStatus = KeSaveFloatingPointState (&floatingPointState);
#endif
WHIRLPOOL_add (ioBuffer, TC_BOOT_SECTOR_PIM_VALUE_OFFSET, &whirlpool);
@@ -368,7 +368,7 @@ static void ComputeBootLoaderFingerprint(PDEVICE_OBJECT LowerDeviceObject, byte*
}
#if !defined (_WIN64)
- if (NT_SUCCESS (saveStatus) && HasISSE())
+ if (NT_SUCCESS (saveStatus) && (HasISSE() || (HasSSE2() && HasMMX())))
KeRestoreFloatingPointState (&floatingPointState);
#endif
}
diff --git a/src/Driver/Driver.vcxproj b/src/Driver/Driver.vcxproj
index a108f426..381d2083 100644
--- a/src/Driver/Driver.vcxproj
+++ b/src/Driver/Driver.vcxproj
@@ -225,6 +225,33 @@ BuildDriver.cmd -rebuild -debug -x64 "$(SolutionDir)\Common" "$(SolutionDir)\Cry
<None Include="..\Crypto\Aes_x86.asm" />
<None Include="..\Crypto\Camellia_aesni_x64.S" />
<None Include="..\Crypto\Camellia_x64.S" />
+ <None Include="..\Crypto\sha256-x86-nayuki.S">
+ <FileType>Document</FileType>
+ </None>
+ <None Include="..\Crypto\sha256_avx1_x64.asm">
+ <FileType>Document</FileType>
+ </None>
+ <None Include="..\Crypto\sha256_avx2_x64.asm">
+ <FileType>Document</FileType>
+ </None>
+ <None Include="..\Crypto\sha256_sse4_x64.asm">
+ <FileType>Document</FileType>
+ </None>
+ <None Include="..\Crypto\sha512-x64-nayuki.S">
+ <FileType>Document</FileType>
+ </None>
+ <None Include="..\Crypto\sha512-x86-nayuki.S">
+ <FileType>Document</FileType>
+ </None>
+ <None Include="..\Crypto\sha512_avx1_x64.asm">
+ <FileType>Document</FileType>
+ </None>
+ <None Include="..\Crypto\sha512_avx2_x64.asm">
+ <FileType>Document</FileType>
+ </None>
+ <None Include="..\Crypto\sha512_sse4_x64.asm">
+ <FileType>Document</FileType>
+ </None>
<None Include="..\Crypto\Twofish_x64.S" />
<None Include="BuildDriver.cmd" />
<None Include="Makefile" />
diff --git a/src/Driver/Driver.vcxproj.filters b/src/Driver/Driver.vcxproj.filters
index 5a44984d..3622c7a8 100644
--- a/src/Driver/Driver.vcxproj.filters
+++ b/src/Driver/Driver.vcxproj.filters
@@ -152,6 +152,33 @@
<None Include="..\Crypto\Twofish_x64.S">
<Filter>Source Files\Crypto</Filter>
</None>
+ <None Include="..\Crypto\sha256-x86-nayuki.S">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
+ <None Include="..\Crypto\sha512_sse4_x64.asm">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
+ <None Include="..\Crypto\sha256_avx1_x64.asm">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
+ <None Include="..\Crypto\sha256_avx2_x64.asm">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
+ <None Include="..\Crypto\sha256_sse4_x64.asm">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
+ <None Include="..\Crypto\sha512-x86-nayuki.S">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
+ <None Include="..\Crypto\sha512-x64-nayuki.S">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
+ <None Include="..\Crypto\sha512_avx1_x64.asm">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
+ <None Include="..\Crypto\sha512_avx2_x64.asm">
+ <Filter>Source Files\Crypto</Filter>
+ </None>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Apidrvr.h">
diff --git a/src/Volume/Volume.make b/src/Volume/Volume.make
index 0ecc7f42..e179c563 100644
--- a/src/Volume/Volume.make
+++ b/src/Volume/Volume.make
@@ -36,15 +36,32 @@ ifeq "$(PLATFORM)" "MacOSX"
OBJSEX += ../Crypto/Twofish_asm.oo
OBJSEX += ../Crypto/Camellia_asm.oo
OBJSEX += ../Crypto/Camellia_aesni_asm.oo
+ OBJS += ../Crypto/sha256-nayuki.oo
+ OBJS += ../Crypto/sha512-nayuki.oo
+ OBJS += ../Crypto/sha256_avx1.oo
+ OBJS += ../Crypto/sha256_avx2.oo
+ OBJS += ../Crypto/sha256_sse4.oo
+ OBJS += ../Crypto/sha512_avx1.oo
+ OBJS += ../Crypto/sha512_avx2.oo
+ OBJS += ../Crypto/sha512_sse4.oo
else ifeq "$(CPU_ARCH)" "x86"
OBJS += ../Crypto/Aes_x86.o
OBJS += ../Crypto/Aes_hw_cpu.o
+ OBJS += ../Crypto/sha256-x86-nayuki.o
+ OBJS += ../Crypto/sha512-x86-nayuki.o
else ifeq "$(CPU_ARCH)" "x64"
OBJS += ../Crypto/Aes_x64.o
OBJS += ../Crypto/Aes_hw_cpu.o
OBJS += ../Crypto/Twofish_x64.o
OBJS += ../Crypto/Camellia_x64.o
OBJS += ../Crypto/Camellia_aesni_x64.o
+ OBJS += ../Crypto/sha512-x64-nayuki.o
+ OBJS += ../Crypto/sha256_avx1_x64.o
+ OBJS += ../Crypto/sha256_avx2_x64.o
+ OBJS += ../Crypto/sha256_sse4_x64.o
+ OBJS += ../Crypto/sha512_avx1_x64.o
+ OBJS += ../Crypto/sha512_avx2_x64.o
+ OBJS += ../Crypto/sha512_sse4_x64.o
else
OBJS += ../Crypto/Aescrypt.o
endif
@@ -87,6 +104,33 @@ ifeq "$(PLATFORM)" "MacOSX"
../Crypto/Camellia_aesni_asm.oo: ../Crypto/Camellia_aesni_x64.S
@echo Assembling $(<F)
$(YASM) -p gas -f macho64 -o ../Crypto/Camellia_aesni_asm.oo ../Crypto/Camellia_aesni_x64.S
+../Crypto/sha256-nayuki.oo: ../Crypto/sha256-x86-nayuki.S
+ @echo Assembling $(<F)
+ $(YASM) -p gas -f macho32 -o ../Crypto/sha256-nayuki.oo ../Crypto/sha256-x86-nayuki.S
+../Crypto/sha256_avx1.oo: ../Crypto/sha256_avx1_x64.asm
+ @echo Assembling $(<F)
+ $(YASM) -f macho64 -o ../Crypto/sha256_avx1.oo ../Crypto/sha256_avx1_x64.asm
+../Crypto/sha256_avx2.oo: ../Crypto/sha256_avx2_x64.asm
+ @echo Assembling $(<F)
+ $(YASM) -f macho64 -o ../Crypto/sha256_avx2.oo ../Crypto/sha256_avx2_x64.asm
+../Crypto/sha256_sse4.oo: ../Crypto/sha256_sse4_x64.asm
+ @echo Assembling $(<F)
+ $(YASM) -f macho64 -o ../Crypto/sha256_sse4.oo ../Crypto/sha256_sse4_x64.asm
+../Crypto/sha512-nayuki.oo: ../Crypto/sha512-x86-nayuki.S ../Crypto/sha512-x64-nayuki.S
+ @echo Assembling $(<F)
+ $(YASM) -p gas -f macho32 -o ../Crypto/sha512-x86-nayuki.o ../Crypto/sha512-x86-nayuki.S
+ $(YASM) -p gas -f macho64 -o ../Crypto/sha512-x64-nayuki.o ../Crypto/sha512-x64-nayuki.S
+ lipo -create ../Crypto/sha512-x64-nayuki.o ../Crypto/sha512-x64-nayuki.o -output ../Crypto/sha512-nayuki.oo
+ rm -fr ../Crypto/sha512-x86-nayuki.o ../Crypto/sha512-x64-nayuki.o
+../Crypto/sha512_avx1.oo: ../Crypto/sha512_avx1_x64.asm
+ @echo Assembling $(<F)
+ $(YASM) -f macho64 -o ../Crypto/sha512_avx1.oo ../Crypto/sha512_avx1_x64.asm
+../Crypto/sha512_avx2.oo: ../Crypto/sha512_avx2_x64.asm
+ @echo Assembling $(<F)
+ $(YASM) -f macho64 -o ../Crypto/sha512_avx2.oo ../Crypto/sha512_avx2_x64.asm
+../Crypto/sha512_sse4.oo: ../Crypto/sha512_sse4_x64.asm
+ @echo Assembling $(<F)
+ $(YASM) -f macho64 -o ../Crypto/sha512_sse4.oo ../Crypto/sha512_sse4_x64.asm
endif
include $(BUILD_INC)/Makefile.inc