30 files changed, 11276 insertions, 11276 deletions
diff --git a/src/Crypto/Aes.h b/src/Crypto/Aes.h
index 7a1eff47..e12c6fc8 100644
--- a/src/Crypto/Aes.h
+++ b/src/Crypto/Aes.h
@@ -1,215 +1,215 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 20/12/2007
-
- This file contains the definitions required to use AES in C. See aesopt.h
- for optimisation details.
-*/
-
-/* Adapted for TrueCrypt */
-
-#ifndef _AES_H
-#define _AES_H
-
-#include "Common/Tcdefs.h"
-
-#ifndef EXIT_SUCCESS
-#define EXIT_SUCCESS    0
-#define EXIT_FAILURE    1
-#endif
-#define INT_RETURN   int
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-// #define AES_128     /* define if AES with 128 bit keys is needed    */
-// #define AES_192     /* define if AES with 192 bit keys is needed    */
-#define AES_256     /* define if AES with 256 bit keys is needed    */
-// #define AES_VAR     /* define if a variable key size is needed      */
-// #define AES_MODES   /* define if support is needed for modes        */
-
-/* The following must also be set in assembler files if being used  */
-
-#define AES_ENCRYPT /* if support for encryption is needed          */
-#define AES_DECRYPT /* if support for decryption is needed          */
-#define AES_ERR_CHK /* for parameter checks & error return codes    */
-#define AES_REV_DKS /* define to reverse decryption key schedule    */
-
-#define AES_BLOCK_SIZE  16  /* the AES block size in bytes          */
-#define N_COLS           4  /* the number of columns in the state   */
-
-/* The key schedule length is 11, 13 or 15 16-byte blocks for 128,  */
-/* 192 or 256-bit keys respectively. That is 176, 208 or 240 bytes  */
-/* or 44, 52 or 60 32-bit words.                                    */
-
-#if defined( AES_VAR ) || defined( AES_256 )
-#define KS_LENGTH       60
-#elif defined( AES_192 )
-#define KS_LENGTH       52
-#else
-#define KS_LENGTH       44
-#endif
-
-#if defined( AES_ERR_CHK )
-#define AES_RETURN     INT_RETURN
-#else
-#define AES_RETURN     VOID_RETURN
-#endif
-
-/* the character array 'inf' in the following structures is used    */
-/* to hold AES context information. This AES code uses cx->inf.b[0] */
-/* to hold the number of rounds multiplied by 16. The other three   */
-/* elements can be used by code that implements additional modes    */
-
-typedef union
-{   uint_32t l;
-    uint_8t b[4];
-} aes_inf;
-
-typedef struct
-{   uint_32t ks[KS_LENGTH];
-    aes_inf inf;
-} aes_encrypt_ctx;
-
-typedef struct
-{   uint_32t ks[KS_LENGTH];
-    aes_inf inf;
-} aes_decrypt_ctx;
-
-/* This routine must be called before first use if non-static       */
-/* tables are being used                                            */
-
-AES_RETURN aes_init(void);
-
-/* Key lengths in the range 16 <= key_len <= 32 are given in bytes, */
-/* those in the range 128 <= key_len <= 256 are given in bits       */
-
-#if defined( AES_ENCRYPT )
-
-#if defined(AES_128) || defined(AES_VAR)
-AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]);
-#endif
-
-#if defined(AES_192) || defined(AES_VAR)
-AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]);
-#endif
-
-#if defined(AES_256) || defined(AES_VAR)
-AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]);
-#endif
-
-#if defined(AES_VAR)
-AES_RETURN aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]);
-#endif
-
-AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1]);
-
-#endif
-
-#if defined( AES_DECRYPT )
-
-#if defined(AES_128) || defined(AES_VAR)
-AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]);
-#endif
-
-#if defined(AES_192) || defined(AES_VAR)
-AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]);
-#endif
-
-#if defined(AES_256) || defined(AES_VAR)
-AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]);
-#endif
-
-#if defined(AES_VAR)
-AES_RETURN aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]);
-#endif
-
-AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1]);
-
-#endif
-
-#if defined(AES_MODES)
-
-/* Multiple calls to the following subroutines for multiple block   */
-/* ECB, CBC, CFB, OFB and CTR mode encryption can be used to handle */
-/* long messages incremantally provided that the context AND the iv */
-/* are preserved between all such calls.  For the ECB and CBC modes */
-/* each individual call within a series of incremental calls must   */
-/* process only full blocks (i.e. len must be a multiple of 16) but */
-/* the CFB, OFB and CTR mode calls can handle multiple incremental  */
-/* calls of any length. Each mode is reset when a new AES key is    */
-/* set but ECB and CBC operations can be reset without setting a    */
-/* new key by setting a new IV value.  To reset CFB, OFB and CTR    */
-/* without setting the key, aes_mode_reset() must be called and the */
-/* IV must be set.  NOTE: All these calls update the IV on exit so  */
-/* this has to be reset if a new operation with the same IV as the  */
-/* previous one is required (or decryption follows encryption with  */
-/* the same IV array).                                              */
-
-AES_RETURN aes_test_alignment_detection(unsigned int n);
-
-AES_RETURN aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf,
-                    int len, const aes_encrypt_ctx cx[1]);
-
-AES_RETURN aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf,
-                    int len, const aes_decrypt_ctx cx[1]);
-
-AES_RETURN aes_cbc_encrypt(const unsigned char *ibuf, unsigned char *obuf,
-                    int len, unsigned char *iv, const aes_encrypt_ctx cx[1]);
-
-AES_RETURN aes_cbc_decrypt(const unsigned char *ibuf, unsigned char *obuf,
-                    int len, unsigned char *iv, const aes_decrypt_ctx cx[1]);
-
-AES_RETURN aes_mode_reset(aes_encrypt_ctx cx[1]);
-
-AES_RETURN aes_cfb_encrypt(const unsigned char *ibuf, unsigned char *obuf,
-                    int len, unsigned char *iv, aes_encrypt_ctx cx[1]);
-
-AES_RETURN aes_cfb_decrypt(const unsigned char *ibuf, unsigned char *obuf,
-                    int len, unsigned char *iv, aes_encrypt_ctx cx[1]);
-
-#define aes_ofb_encrypt aes_ofb_crypt
-#define aes_ofb_decrypt aes_ofb_crypt
-
-AES_RETURN aes_ofb_crypt(const unsigned char *ibuf, unsigned char *obuf,
-                    int len, unsigned char *iv, aes_encrypt_ctx cx[1]);
-
-typedef void cbuf_inc(unsigned char *cbuf);
-
-#define aes_ctr_encrypt aes_ctr_crypt
-#define aes_ctr_decrypt aes_ctr_crypt
-
-AES_RETURN aes_ctr_crypt(const unsigned char *ibuf, unsigned char *obuf,
-            int len, unsigned char *cbuf, cbuf_inc ctr_inc, aes_encrypt_ctx cx[1]);
-
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+
+ This file contains the definitions required to use AES in C. See aesopt.h
+ for optimisation details.
+*/
+
+/* Adapted for TrueCrypt */
+
+#ifndef _AES_H
+#define _AES_H
+
+#include "Common/Tcdefs.h"
+
+#ifndef EXIT_SUCCESS
+#define EXIT_SUCCESS    0
+#define EXIT_FAILURE    1
+#endif
+#define INT_RETURN   int
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+// #define AES_128     /* define if AES with 128 bit keys is needed    */
+// #define AES_192     /* define if AES with 192 bit keys is needed    */
+#define AES_256     /* define if AES with 256 bit keys is needed    */
+// #define AES_VAR     /* define if a variable key size is needed      */
+// #define AES_MODES   /* define if support is needed for modes        */
+
+/* The following must also be set in assembler files if being used  */
+
+#define AES_ENCRYPT /* if support for encryption is needed          */
+#define AES_DECRYPT /* if support for decryption is needed          */
+#define AES_ERR_CHK /* for parameter checks & error return codes    */
+#define AES_REV_DKS /* define to reverse decryption key schedule    */
+
+#define AES_BLOCK_SIZE  16  /* the AES block size in bytes          */
+#define N_COLS           4  /* the number of columns in the state   */
+
+/* The key schedule length is 11, 13 or 15 16-byte blocks for 128,  */
+/* 192 or 256-bit keys respectively. That is 176, 208 or 240 bytes  */
+/* or 44, 52 or 60 32-bit words.                                    */
+
+#if defined( AES_VAR ) || defined( AES_256 )
+#define KS_LENGTH       60
+#elif defined( AES_192 )
+#define KS_LENGTH       52
+#else
+#define KS_LENGTH       44
+#endif
+
+#if defined( AES_ERR_CHK )
+#define AES_RETURN     INT_RETURN
+#else
+#define AES_RETURN     VOID_RETURN
+#endif
+
+/* the character array 'inf' in the following structures is used    */
+/* to hold AES context information. This AES code uses cx->inf.b[0] */
+/* to hold the number of rounds multiplied by 16. The other three   */
+/* elements can be used by code that implements additional modes    */
+
+typedef union
+{   uint_32t l;
+    uint_8t b[4];
+} aes_inf;
+
+typedef struct
+{   uint_32t ks[KS_LENGTH];
+    aes_inf inf;
+} aes_encrypt_ctx;
+
+typedef struct
+{   uint_32t ks[KS_LENGTH];
+    aes_inf inf;
+} aes_decrypt_ctx;
+
+/* This routine must be called before first use if non-static       */
+/* tables are being used                                            */
+
+AES_RETURN aes_init(void);
+
+/* Key lengths in the range 16 <= key_len <= 32 are given in bytes, */
+/* those in the range 128 <= key_len <= 256 are given in bits       */
+
+#if defined( AES_ENCRYPT )
+
+#if defined(AES_128) || defined(AES_VAR)
+AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]);
+#endif
+
+#if defined(AES_192) || defined(AES_VAR)
+AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]);
+#endif
+
+#if defined(AES_256) || defined(AES_VAR)
+AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]);
+#endif
+
+#if defined(AES_VAR)
+AES_RETURN aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]);
+#endif
+
+AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1]);
+
+#endif
+
+#if defined( AES_DECRYPT )
+
+#if defined(AES_128) || defined(AES_VAR)
+AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]);
+#endif
+
+#if defined(AES_192) || defined(AES_VAR)
+AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]);
+#endif
+
+#if defined(AES_256) || defined(AES_VAR)
+AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]);
+#endif
+
+#if defined(AES_VAR)
+AES_RETURN aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]);
+#endif
+
+AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1]);
+
+#endif
+
+#if defined(AES_MODES)
+
+/* Multiple calls to the following subroutines for multiple block   */
+/* ECB, CBC, CFB, OFB and CTR mode encryption can be used to handle */
+/* long messages incrementally provided that the context AND the iv */
+/* are preserved between all such calls.  For the ECB and CBC modes */
+/* each individual call within a series of incremental calls must   */
+/* process only full blocks (i.e. len must be a multiple of 16) but */
+/* the CFB, OFB and CTR mode calls can handle multiple incremental  */
+/* calls of any length. Each mode is reset when a new AES key is    */
+/* set but ECB and CBC operations can be reset without setting a    */
+/* new key by setting a new IV value.  To reset CFB, OFB and CTR    */
+/* without setting the key, aes_mode_reset() must be called and the */
+/* IV must be set.  NOTE: All these calls update the IV on exit so  */
+/* this has to be reset if a new operation with the same IV as the  */
+/* previous one is required (or decryption follows encryption with  */
+/* the same IV array).                                              */
+
+AES_RETURN aes_test_alignment_detection(unsigned int n);
+
+AES_RETURN aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf,
+                    int len, const aes_encrypt_ctx cx[1]);
+
+AES_RETURN aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf,
+                    int len, const aes_decrypt_ctx cx[1]);
+
+AES_RETURN aes_cbc_encrypt(const unsigned char *ibuf, unsigned char *obuf,
+                    int len, unsigned char *iv, const aes_encrypt_ctx cx[1]);
+
+AES_RETURN aes_cbc_decrypt(const unsigned char *ibuf, unsigned char *obuf,
+                    int len, unsigned char *iv, const aes_decrypt_ctx cx[1]);
+
+AES_RETURN aes_mode_reset(aes_encrypt_ctx cx[1]);
+
+AES_RETURN aes_cfb_encrypt(const unsigned char *ibuf, unsigned char *obuf,
+                    int len, unsigned char *iv, aes_encrypt_ctx cx[1]);
+
+AES_RETURN aes_cfb_decrypt(const unsigned char *ibuf, unsigned char *obuf,
+                    int len, unsigned char *iv, aes_encrypt_ctx cx[1]);
+
+#define aes_ofb_encrypt aes_ofb_crypt
+#define aes_ofb_decrypt aes_ofb_crypt
+
+AES_RETURN aes_ofb_crypt(const unsigned char *ibuf, unsigned char *obuf,
+                    int len, unsigned char *iv, aes_encrypt_ctx cx[1]);
+
+typedef void cbuf_inc(unsigned char *cbuf);
+
+#define aes_ctr_encrypt aes_ctr_crypt
+#define aes_ctr_decrypt aes_ctr_crypt
+
+AES_RETURN aes_ctr_crypt(const unsigned char *ibuf, unsigned char *obuf,
+            int len, unsigned char *cbuf, cbuf_inc ctr_inc, aes_encrypt_ctx cx[1]);
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/src/Crypto/AesSmall.c b/src/Crypto/AesSmall.c
index 91c89873..10e7cf83 100644
--- a/src/Crypto/AesSmall.c
+++ b/src/Crypto/AesSmall.c
@@ -1,953 +1,953 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue 09/09/2006
-
- This is an AES implementation that uses only 8-bit byte operations on the
- cipher state (there are options to use 32-bit types if available).
-
- The combination of mix columns and byte substitution used here is based on
- that developed by Karl Malbrain. His contribution is acknowledged.
- */
-
-/* Adapted for TrueCrypt:
-  - Macro-generated tables were replaced with static data to enable compiling
-    with MSVC++ 1.5 which runs out of resources when expanding large macros.
-*/
-
-#pragma optimize ("t", on)
-
-/* define if you have a fast memcpy function on your system */
-#if 1
-#  define HAVE_MEMCPY
-#  include <string.h>
-#  if defined( _MSC_VER )
-#    ifndef DEBUG
-#      pragma intrinsic( memcpy )
-#    endif
-#  endif
-#endif
-
-/* define if you have fast 32-bit types on your system */
-#if 1
-#  define HAVE_UINT_32T
-#endif
-
-/* alternative versions (test for performance on your system) */
-#if 0
-#  define VERSION_1
-#endif
-
-#include "AesSmall.h"
-
-#define WPOLY   0x011b
-#define DPOLY   0x008d
-#define f1(x)   (x)
-#define f2(x)   ((x<<1) ^ (((x>>7) & 1) * WPOLY))
-#define f4(x)   ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
-#define f8(x)   ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \
-                        ^ (((x>>5) & 4) * WPOLY))
-#define d2(x)   (((x) >> 1) ^ ((x) & 1 ? DPOLY : 0))
-
-#define f3(x)   (f2(x) ^ x)
-#define f9(x)   (f8(x) ^ x)
-#define fb(x)   (f8(x) ^ f2(x) ^ x)
-#define fd(x)   (f8(x) ^ f4(x) ^ x)
-#define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
-
-static const uint_8t s_box[256] = {
-	0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,
-	0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
-	0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,
-	0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
-	0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,
-	0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
-	0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,
-	0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
-	0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,
-	0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
-	0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,
-	0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
-	0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,
-	0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
-	0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,
-	0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
-	0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,
-	0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
-	0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,
-	0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
-	0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,
-	0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
-	0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,
-	0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
-	0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,
-	0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
-	0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,
-	0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
-	0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,
-	0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
-	0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,
-	0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
-};
-
-static const uint_8t inv_s_box[256] = {
-	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38,
-	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb,
-	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87,
-	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb,
-	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d,
-	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e,
-	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2,
-	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25,
-	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16,
-	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92,
-	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda,
-	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84,
-	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a,
-	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06,
-	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02,
-	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b,
-	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea,
-	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73,
-	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85,
-	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e,
-	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89,
-	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b,
-	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20,
-	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4,
-	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31,
-	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f,
-	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d,
-	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef,
-	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0,
-	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61,
-	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26,
-	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
-};
-
-static const uint_8t gfm2_s_box[256] = {
-	0xc6,0xf8,0xee,0xf6,0xff,0xd6,0xde,0x91,
-	0x60,0x02,0xce,0x56,0xe7,0xb5,0x4d,0xec,
-	0x8f,0x1f,0x89,0xfa,0xef,0xb2,0x8e,0xfb,
-	0x41,0xb3,0x5f,0x45,0x23,0x53,0xe4,0x9b,
-	0x75,0xe1,0x3d,0x4c,0x6c,0x7e,0xf5,0x83,
-	0x68,0x51,0xd1,0xf9,0xe2,0xab,0x62,0x2a,
-	0x08,0x95,0x46,0x9d,0x30,0x37,0x0a,0x2f,
-	0x0e,0x24,0x1b,0xdf,0xcd,0x4e,0x7f,0xea,
-	0x12,0x1d,0x58,0x34,0x36,0xdc,0xb4,0x5b,
-	0xa4,0x76,0xb7,0x7d,0x52,0xdd,0x5e,0x13,
-	0xa6,0xb9,0x00,0xc1,0x40,0xe3,0x79,0xb6,
-	0xd4,0x8d,0x67,0x72,0x94,0x98,0xb0,0x85,
-	0xbb,0xc5,0x4f,0xed,0x86,0x9a,0x66,0x11,
-	0x8a,0xe9,0x04,0xfe,0xa0,0x78,0x25,0x4b,
-	0xa2,0x5d,0x80,0x05,0x3f,0x21,0x70,0xf1,
-	0x63,0x77,0xaf,0x42,0x20,0xe5,0xfd,0xbf,
-	0x81,0x18,0x26,0xc3,0xbe,0x35,0x88,0x2e,
-	0x93,0x55,0xfc,0x7a,0xc8,0xba,0x32,0xe6,
-	0xc0,0x19,0x9e,0xa3,0x44,0x54,0x3b,0x0b,
-	0x8c,0xc7,0x6b,0x28,0xa7,0xbc,0x16,0xad,
-	0xdb,0x64,0x74,0x14,0x92,0x0c,0x48,0xb8,
-	0x9f,0xbd,0x43,0xc4,0x39,0x31,0xd3,0xf2,
-	0xd5,0x8b,0x6e,0xda,0x01,0xb1,0x9c,0x49,
-	0xd8,0xac,0xf3,0xcf,0xca,0xf4,0x47,0x10,
-	0x6f,0xf0,0x4a,0x5c,0x38,0x57,0x73,0x97,
-	0xcb,0xa1,0xe8,0x3e,0x96,0x61,0x0d,0x0f,
-	0xe0,0x7c,0x71,0xcc,0x90,0x06,0xf7,0x1c,
-	0xc2,0x6a,0xae,0x69,0x17,0x99,0x3a,0x27,
-	0xd9,0xeb,0x2b,0x22,0xd2,0xa9,0x07,0x33,
-	0x2d,0x3c,0x15,0xc9,0x87,0xaa,0x50,0xa5,
-	0x03,0x59,0x09,0x1a,0x65,0xd7,0x84,0xd0,
-	0x82,0x29,0x5a,0x1e,0x7b,0xa8,0x6d,0x2c
-};
-
-static const uint_8t gfm3_s_box[256] = {
-	0xa5,0x84,0x99,0x8d,0x0d,0xbd,0xb1,0x54,
-	0x50,0x03,0xa9,0x7d,0x19,0x62,0xe6,0x9a,
-	0x45,0x9d,0x40,0x87,0x15,0xeb,0xc9,0x0b,
-	0xec,0x67,0xfd,0xea,0xbf,0xf7,0x96,0x5b,
-	0xc2,0x1c,0xae,0x6a,0x5a,0x41,0x02,0x4f,
-	0x5c,0xf4,0x34,0x08,0x93,0x73,0x53,0x3f,
-	0x0c,0x52,0x65,0x5e,0x28,0xa1,0x0f,0xb5,
-	0x09,0x36,0x9b,0x3d,0x26,0x69,0xcd,0x9f,
-	0x1b,0x9e,0x74,0x2e,0x2d,0xb2,0xee,0xfb,
-	0xf6,0x4d,0x61,0xce,0x7b,0x3e,0x71,0x97,
-	0xf5,0x68,0x00,0x2c,0x60,0x1f,0xc8,0xed,
-	0xbe,0x46,0xd9,0x4b,0xde,0xd4,0xe8,0x4a,
-	0x6b,0x2a,0xe5,0x16,0xc5,0xd7,0x55,0x94,
-	0xcf,0x10,0x06,0x81,0xf0,0x44,0xba,0xe3,
-	0xf3,0xfe,0xc0,0x8a,0xad,0xbc,0x48,0x04,
-	0xdf,0xc1,0x75,0x63,0x30,0x1a,0x0e,0x6d,
-	0x4c,0x14,0x35,0x2f,0xe1,0xa2,0xcc,0x39,
-	0x57,0xf2,0x82,0x47,0xac,0xe7,0x2b,0x95,
-	0xa0,0x98,0xd1,0x7f,0x66,0x7e,0xab,0x83,
-	0xca,0x29,0xd3,0x3c,0x79,0xe2,0x1d,0x76,
-	0x3b,0x56,0x4e,0x1e,0xdb,0x0a,0x6c,0xe4,
-	0x5d,0x6e,0xef,0xa6,0xa8,0xa4,0x37,0x8b,
-	0x32,0x43,0x59,0xb7,0x8c,0x64,0xd2,0xe0,
-	0xb4,0xfa,0x07,0x25,0xaf,0x8e,0xe9,0x18,
-	0xd5,0x88,0x6f,0x72,0x24,0xf1,0xc7,0x51,
-	0x23,0x7c,0x9c,0x21,0xdd,0xdc,0x86,0x85,
-	0x90,0x42,0xc4,0xaa,0xd8,0x05,0x01,0x12,
-	0xa3,0x5f,0xf9,0xd0,0x91,0x58,0x27,0xb9,
-	0x38,0x13,0xb3,0x33,0xbb,0x70,0x89,0xa7,
-	0xb6,0x22,0x92,0x20,0x49,0xff,0x78,0x7a,
-	0x8f,0xf8,0x80,0x17,0xda,0x31,0xc6,0xb8,
-	0xc3,0xb0,0x77,0x11,0xcb,0xfc,0xd6,0x3a
-};
-
-static const uint_8t gfmul_9[256] = {
-	0x00,0x09,0x12,0x1b,0x24,0x2d,0x36,0x3f,
-	0x48,0x41,0x5a,0x53,0x6c,0x65,0x7e,0x77,
-	0x90,0x99,0x82,0x8b,0xb4,0xbd,0xa6,0xaf,
-	0xd8,0xd1,0xca,0xc3,0xfc,0xf5,0xee,0xe7,
-	0x3b,0x32,0x29,0x20,0x1f,0x16,0x0d,0x04,
-	0x73,0x7a,0x61,0x68,0x57,0x5e,0x45,0x4c,
-	0xab,0xa2,0xb9,0xb0,0x8f,0x86,0x9d,0x94,
-	0xe3,0xea,0xf1,0xf8,0xc7,0xce,0xd5,0xdc,
-	0x76,0x7f,0x64,0x6d,0x52,0x5b,0x40,0x49,
-	0x3e,0x37,0x2c,0x25,0x1a,0x13,0x08,0x01,
-	0xe6,0xef,0xf4,0xfd,0xc2,0xcb,0xd0,0xd9,
-	0xae,0xa7,0xbc,0xb5,0x8a,0x83,0x98,0x91,
-	0x4d,0x44,0x5f,0x56,0x69,0x60,0x7b,0x72,
-	0x05,0x0c,0x17,0x1e,0x21,0x28,0x33,0x3a,
-	0xdd,0xd4,0xcf,0xc6,0xf9,0xf0,0xeb,0xe2,
-	0x95,0x9c,0x87,0x8e,0xb1,0xb8,0xa3,0xaa,
-	0xec,0xe5,0xfe,0xf7,0xc8,0xc1,0xda,0xd3,
-	0xa4,0xad,0xb6,0xbf,0x80,0x89,0x92,0x9b,
-	0x7c,0x75,0x6e,0x67,0x58,0x51,0x4a,0x43,
-	0x34,0x3d,0x26,0x2f,0x10,0x19,0x02,0x0b,
-	0xd7,0xde,0xc5,0xcc,0xf3,0xfa,0xe1,0xe8,
-	0x9f,0x96,0x8d,0x84,0xbb,0xb2,0xa9,0xa0,
-	0x47,0x4e,0x55,0x5c,0x63,0x6a,0x71,0x78,
-	0x0f,0x06,0x1d,0x14,0x2b,0x22,0x39,0x30,
-	0x9a,0x93,0x88,0x81,0xbe,0xb7,0xac,0xa5,
-	0xd2,0xdb,0xc0,0xc9,0xf6,0xff,0xe4,0xed,
-	0x0a,0x03,0x18,0x11,0x2e,0x27,0x3c,0x35,
-	0x42,0x4b,0x50,0x59,0x66,0x6f,0x74,0x7d,
-	0xa1,0xa8,0xb3,0xba,0x85,0x8c,0x97,0x9e,
-	0xe9,0xe0,0xfb,0xf2,0xcd,0xc4,0xdf,0xd6,
-	0x31,0x38,0x23,0x2a,0x15,0x1c,0x07,0x0e,
-	0x79,0x70,0x6b,0x62,0x5d,0x54,0x4f,0x46
-};
-
-static const uint_8t gfmul_b[256] = {
-	0x00,0x0b,0x16,0x1d,0x2c,0x27,0x3a,0x31,
-	0x58,0x53,0x4e,0x45,0x74,0x7f,0x62,0x69,
-	0xb0,0xbb,0xa6,0xad,0x9c,0x97,0x8a,0x81,
-	0xe8,0xe3,0xfe,0xf5,0xc4,0xcf,0xd2,0xd9,
-	0x7b,0x70,0x6d,0x66,0x57,0x5c,0x41,0x4a,
-	0x23,0x28,0x35,0x3e,0x0f,0x04,0x19,0x12,
-	0xcb,0xc0,0xdd,0xd6,0xe7,0xec,0xf1,0xfa,
-	0x93,0x98,0x85,0x8e,0xbf,0xb4,0xa9,0xa2,
-	0xf6,0xfd,0xe0,0xeb,0xda,0xd1,0xcc,0xc7,
-	0xae,0xa5,0xb8,0xb3,0x82,0x89,0x94,0x9f,
-	0x46,0x4d,0x50,0x5b,0x6a,0x61,0x7c,0x77,
-	0x1e,0x15,0x08,0x03,0x32,0x39,0x24,0x2f,
-	0x8d,0x86,0x9b,0x90,0xa1,0xaa,0xb7,0xbc,
-	0xd5,0xde,0xc3,0xc8,0xf9,0xf2,0xef,0xe4,
-	0x3d,0x36,0x2b,0x20,0x11,0x1a,0x07,0x0c,
-	0x65,0x6e,0x73,0x78,0x49,0x42,0x5f,0x54,
-	0xf7,0xfc,0xe1,0xea,0xdb,0xd0,0xcd,0xc6,
-	0xaf,0xa4,0xb9,0xb2,0x83,0x88,0x95,0x9e,
-	0x47,0x4c,0x51,0x5a,0x6b,0x60,0x7d,0x76,
-	0x1f,0x14,0x09,0x02,0x33,0x38,0x25,0x2e,
-	0x8c,0x87,0x9a,0x91,0xa0,0xab,0xb6,0xbd,
-	0xd4,0xdf,0xc2,0xc9,0xf8,0xf3,0xee,0xe5,
-	0x3c,0x37,0x2a,0x21,0x10,0x1b,0x06,0x0d,
-	0x64,0x6f,0x72,0x79,0x48,0x43,0x5e,0x55,
-	0x01,0x0a,0x17,0x1c,0x2d,0x26,0x3b,0x30,
-	0x59,0x52,0x4f,0x44,0x75,0x7e,0x63,0x68,
-	0xb1,0xba,0xa7,0xac,0x9d,0x96,0x8b,0x80,
-	0xe9,0xe2,0xff,0xf4,0xc5,0xce,0xd3,0xd8,
-	0x7a,0x71,0x6c,0x67,0x56,0x5d,0x40,0x4b,
-	0x22,0x29,0x34,0x3f,0x0e,0x05,0x18,0x13,
-	0xca,0xc1,0xdc,0xd7,0xe6,0xed,0xf0,0xfb,
-	0x92,0x99,0x84,0x8f,0xbe,0xb5,0xa8,0xa3
-};
-
-static const uint_8t gfmul_d[256] = {
-	0x00,0x0d,0x1a,0x17,0x34,0x39,0x2e,0x23,
-	0x68,0x65,0x72,0x7f,0x5c,0x51,0x46,0x4b,
-	0xd0,0xdd,0xca,0xc7,0xe4,0xe9,0xfe,0xf3,
-	0xb8,0xb5,0xa2,0xaf,0x8c,0x81,0x96,0x9b,
-	0xbb,0xb6,0xa1,0xac,0x8f,0x82,0x95,0x98,
-	0xd3,0xde,0xc9,0xc4,0xe7,0xea,0xfd,0xf0,
-	0x6b,0x66,0x71,0x7c,0x5f,0x52,0x45,0x48,
-	0x03,0x0e,0x19,0x14,0x37,0x3a,0x2d,0x20,
-	0x6d,0x60,0x77,0x7a,0x59,0x54,0x43,0x4e,
-	0x05,0x08,0x1f,0x12,0x31,0x3c,0x2b,0x26,
-	0xbd,0xb0,0xa7,0xaa,0x89,0x84,0x93,0x9e,
-	0xd5,0xd8,0xcf,0xc2,0xe1,0xec,0xfb,0xf6,
-	0xd6,0xdb,0xcc,0xc1,0xe2,0xef,0xf8,0xf5,
-	0xbe,0xb3,0xa4,0xa9,0x8a,0x87,0x90,0x9d,
-	0x06,0x0b,0x1c,0x11,0x32,0x3f,0x28,0x25,
-	0x6e,0x63,0x74,0x79,0x5a,0x57,0x40,0x4d,
-	0xda,0xd7,0xc0,0xcd,0xee,0xe3,0xf4,0xf9,
-	0xb2,0xbf,0xa8,0xa5,0x86,0x8b,0x9c,0x91,
-	0x0a,0x07,0x10,0x1d,0x3e,0x33,0x24,0x29,
-	0x62,0x6f,0x78,0x75,0x56,0x5b,0x4c,0x41,
-	0x61,0x6c,0x7b,0x76,0x55,0x58,0x4f,0x42,
-	0x09,0x04,0x13,0x1e,0x3d,0x30,0x27,0x2a,
-	0xb1,0xbc,0xab,0xa6,0x85,0x88,0x9f,0x92,
-	0xd9,0xd4,0xc3,0xce,0xed,0xe0,0xf7,0xfa,
-	0xb7,0xba,0xad,0xa0,0x83,0x8e,0x99,0x94,
-	0xdf,0xd2,0xc5,0xc8,0xeb,0xe6,0xf1,0xfc,
-	0x67,0x6a,0x7d,0x70,0x53,0x5e,0x49,0x44,
-	0x0f,0x02,0x15,0x18,0x3b,0x36,0x21,0x2c,
-	0x0c,0x01,0x16,0x1b,0x38,0x35,0x22,0x2f,
-	0x64,0x69,0x7e,0x73,0x50,0x5d,0x4a,0x47,
-	0xdc,0xd1,0xc6,0xcb,0xe8,0xe5,0xf2,0xff,
-	0xb4,0xb9,0xae,0xa3,0x80,0x8d,0x9a,0x97
-};
-
-static const uint_8t gfmul_e[256] = {
-	0x00,0x0e,0x1c,0x12,0x38,0x36,0x24,0x2a,
-	0x70,0x7e,0x6c,0x62,0x48,0x46,0x54,0x5a,
-	0xe0,0xee,0xfc,0xf2,0xd8,0xd6,0xc4,0xca,
-	0x90,0x9e,0x8c,0x82,0xa8,0xa6,0xb4,0xba,
-	0xdb,0xd5,0xc7,0xc9,0xe3,0xed,0xff,0xf1,
-	0xab,0xa5,0xb7,0xb9,0x93,0x9d,0x8f,0x81,
-	0x3b,0x35,0x27,0x29,0x03,0x0d,0x1f,0x11,
-	0x4b,0x45,0x57,0x59,0x73,0x7d,0x6f,0x61,
-	0xad,0xa3,0xb1,0xbf,0x95,0x9b,0x89,0x87,
-	0xdd,0xd3,0xc1,0xcf,0xe5,0xeb,0xf9,0xf7,
-	0x4d,0x43,0x51,0x5f,0x75,0x7b,0x69,0x67,
-	0x3d,0x33,0x21,0x2f,0x05,0x0b,0x19,0x17,
-	0x76,0x78,0x6a,0x64,0x4e,0x40,0x52,0x5c,
-	0x06,0x08,0x1a,0x14,0x3e,0x30,0x22,0x2c,
-	0x96,0x98,0x8a,0x84,0xae,0xa0,0xb2,0xbc,
-	0xe6,0xe8,0xfa,0xf4,0xde,0xd0,0xc2,0xcc,
-	0x41,0x4f,0x5d,0x53,0x79,0x77,0x65,0x6b,
-	0x31,0x3f,0x2d,0x23,0x09,0x07,0x15,0x1b,
-	0xa1,0xaf,0xbd,0xb3,0x99,0x97,0x85,0x8b,
-	0xd1,0xdf,0xcd,0xc3,0xe9,0xe7,0xf5,0xfb,
-	0x9a,0x94,0x86,0x88,0xa2,0xac,0xbe,0xb0,
-	0xea,0xe4,0xf6,0xf8,0xd2,0xdc,0xce,0xc0,
-	0x7a,0x74,0x66,0x68,0x42,0x4c,0x5e,0x50,
-	0x0a,0x04,0x16,0x18,0x32,0x3c,0x2e,0x20,
-	0xec,0xe2,0xf0,0xfe,0xd4,0xda,0xc8,0xc6,
-	0x9c,0x92,0x80,0x8e,0xa4,0xaa,0xb8,0xb6,
-	0x0c,0x02,0x10,0x1e,0x34,0x3a,0x28,0x26,
-	0x7c,0x72,0x60,0x6e,0x44,0x4a,0x58,0x56,
-	0x37,0x39,0x2b,0x25,0x0f,0x01,0x13,0x1d,
-	0x47,0x49,0x5b,0x55,0x7f,0x71,0x63,0x6d,
-	0xd7,0xd9,0xcb,0xc5,0xef,0xe1,0xf3,0xfd,
-	0xa7,0xa9,0xbb,0xb5,0x9f,0x91,0x83,0x8d
-};
-
-#if defined( HAVE_UINT_32T )
-  typedef unsigned long uint_32t;
-#endif
-
-#if defined( HAVE_MEMCPY )
-#  define block_copy(d, s, l) memcpy(d, s, l)
-#  define block16_copy(d, s)  memcpy(d, s, N_BLOCK)
-#else
-#  define block_copy(d, s, l) copy_block(d, s, l)
-#  define block16_copy(d, s)  copy_block16(d, s)
-#endif
-
-/* block size 'nn' must be a multiple of four */
-
-static void copy_block16( void *d, const void *s )
-{
-#if defined( HAVE_UINT_32T )
-    ((uint_32t*)d)[ 0] = ((uint_32t*)s)[ 0];
-    ((uint_32t*)d)[ 1] = ((uint_32t*)s)[ 1];
-    ((uint_32t*)d)[ 2] = ((uint_32t*)s)[ 2];
-    ((uint_32t*)d)[ 3] = ((uint_32t*)s)[ 3];
-#else
-    ((uint_8t*)d)[ 0] = ((uint_8t*)s)[ 0];
-    ((uint_8t*)d)[ 1] = ((uint_8t*)s)[ 1];
-    ((uint_8t*)d)[ 2] = ((uint_8t*)s)[ 2];
-    ((uint_8t*)d)[ 3] = ((uint_8t*)s)[ 3];
-    ((uint_8t*)d)[ 4] = ((uint_8t*)s)[ 4];
-    ((uint_8t*)d)[ 5] = ((uint_8t*)s)[ 5];
-    ((uint_8t*)d)[ 6] = ((uint_8t*)s)[ 6];
-    ((uint_8t*)d)[ 7] = ((uint_8t*)s)[ 7];
-    ((uint_8t*)d)[ 8] = ((uint_8t*)s)[ 8];
-    ((uint_8t*)d)[ 9] = ((uint_8t*)s)[ 9];
-    ((uint_8t*)d)[10] = ((uint_8t*)s)[10];
-    ((uint_8t*)d)[11] = ((uint_8t*)s)[11];
-    ((uint_8t*)d)[12] = ((uint_8t*)s)[12];
-    ((uint_8t*)d)[13] = ((uint_8t*)s)[13];
-    ((uint_8t*)d)[14] = ((uint_8t*)s)[14];
-    ((uint_8t*)d)[15] = ((uint_8t*)s)[15];
-#endif
-}
-
-static void copy_block( void * d, void *s, uint_8t nn )
-{
-    while( nn-- )
-        *((uint_8t*)d)++ = *((uint_8t*)s)++;
-}
-
-static void xor_block( void *d, const void *s )
-{
-#if defined( HAVE_UINT_32T )
-    ((uint_32t*)d)[ 0] ^= ((uint_32t*)s)[ 0];
-    ((uint_32t*)d)[ 1] ^= ((uint_32t*)s)[ 1];
-    ((uint_32t*)d)[ 2] ^= ((uint_32t*)s)[ 2];
-    ((uint_32t*)d)[ 3] ^= ((uint_32t*)s)[ 3];
-#else
-    ((uint_8t*)d)[ 0] ^= ((uint_8t*)s)[ 0];
-    ((uint_8t*)d)[ 1] ^= ((uint_8t*)s)[ 1];
-    ((uint_8t*)d)[ 2] ^= ((uint_8t*)s)[ 2];
-    ((uint_8t*)d)[ 3] ^= ((uint_8t*)s)[ 3];
-    ((uint_8t*)d)[ 4] ^= ((uint_8t*)s)[ 4];
-    ((uint_8t*)d)[ 5] ^= ((uint_8t*)s)[ 5];
-    ((uint_8t*)d)[ 6] ^= ((uint_8t*)s)[ 6];
-    ((uint_8t*)d)[ 7] ^= ((uint_8t*)s)[ 7];
-    ((uint_8t*)d)[ 8] ^= ((uint_8t*)s)[ 8];
-    ((uint_8t*)d)[ 9] ^= ((uint_8t*)s)[ 9];
-    ((uint_8t*)d)[10] ^= ((uint_8t*)s)[10];
-    ((uint_8t*)d)[11] ^= ((uint_8t*)s)[11];
-    ((uint_8t*)d)[12] ^= ((uint_8t*)s)[12];
-    ((uint_8t*)d)[13] ^= ((uint_8t*)s)[13];
-    ((uint_8t*)d)[14] ^= ((uint_8t*)s)[14];
-    ((uint_8t*)d)[15] ^= ((uint_8t*)s)[15];
-#endif
-}
-
-static void copy_and_key( void *d, const void *s, const void *k )
-{
-#if defined( HAVE_UINT_32T )
-    ((uint_32t*)d)[ 0] = ((uint_32t*)s)[ 0] ^ ((uint_32t*)k)[ 0];
-    ((uint_32t*)d)[ 1] = ((uint_32t*)s)[ 1] ^ ((uint_32t*)k)[ 1];
-    ((uint_32t*)d)[ 2] = ((uint_32t*)s)[ 2] ^ ((uint_32t*)k)[ 2];
-    ((uint_32t*)d)[ 3] = ((uint_32t*)s)[ 3] ^ ((uint_32t*)k)[ 3];
-#elif 1
-    ((uint_8t*)d)[ 0] = ((uint_8t*)s)[ 0] ^ ((uint_8t*)k)[ 0];
-    ((uint_8t*)d)[ 1] = ((uint_8t*)s)[ 1] ^ ((uint_8t*)k)[ 1];
-    ((uint_8t*)d)[ 2] = ((uint_8t*)s)[ 2] ^ ((uint_8t*)k)[ 2];
-    ((uint_8t*)d)[ 3] = ((uint_8t*)s)[ 3] ^ ((uint_8t*)k)[ 3];
-    ((uint_8t*)d)[ 4] = ((uint_8t*)s)[ 4] ^ ((uint_8t*)k)[ 4];
-    ((uint_8t*)d)[ 5] = ((uint_8t*)s)[ 5] ^ ((uint_8t*)k)[ 5];
-    ((uint_8t*)d)[ 6] = ((uint_8t*)s)[ 6] ^ ((uint_8t*)k)[ 6];
-    ((uint_8t*)d)[ 7] = ((uint_8t*)s)[ 7] ^ ((uint_8t*)k)[ 7];
-    ((uint_8t*)d)[ 8] = ((uint_8t*)s)[ 8] ^ ((uint_8t*)k)[ 8];
-    ((uint_8t*)d)[ 9] = ((uint_8t*)s)[ 9] ^ ((uint_8t*)k)[ 9];
-    ((uint_8t*)d)[10] = ((uint_8t*)s)[10] ^ ((uint_8t*)k)[10];
-    ((uint_8t*)d)[11] = ((uint_8t*)s)[11] ^ ((uint_8t*)k)[11];
-    ((uint_8t*)d)[12] = ((uint_8t*)s)[12] ^ ((uint_8t*)k)[12];
-    ((uint_8t*)d)[13] = ((uint_8t*)s)[13] ^ ((uint_8t*)k)[13];
-    ((uint_8t*)d)[14] = ((uint_8t*)s)[14] ^ ((uint_8t*)k)[14];
-    ((uint_8t*)d)[15] = ((uint_8t*)s)[15] ^ ((uint_8t*)k)[15];
-#else
-    block16_copy(d, s);
-    xor_block(d, k);
-#endif
-}
-
-static void add_round_key( uint_8t d[N_BLOCK], const uint_8t k[N_BLOCK] )
-{
-    xor_block(d, k);
-}
-
-static void shift_sub_rows( uint_8t st[N_BLOCK] )
-{   uint_8t tt;
-
-    st[ 0] = s_box[st[ 0]]; st[ 4] = s_box[st[ 4]];
-    st[ 8] = s_box[st[ 8]]; st[12] = s_box[st[12]];
-
-    tt = st[1]; st[ 1] = s_box[st[ 5]]; st[ 5] = s_box[st[ 9]];
-    st[ 9] = s_box[st[13]]; st[13] = s_box[ tt ];
-
-    tt = st[2]; st[ 2] = s_box[st[10]]; st[10] = s_box[ tt ];
-    tt = st[6]; st[ 6] = s_box[st[14]]; st[14] = s_box[ tt ];
-
-    tt = st[15]; st[15] = s_box[st[11]]; st[11] = s_box[st[ 7]];
-    st[ 7] = s_box[st[ 3]]; st[ 3] = s_box[ tt ];
-}
-
-static void inv_shift_sub_rows( uint_8t st[N_BLOCK] )
-{   uint_8t tt;
-
-    st[ 0] = inv_s_box[st[ 0]]; st[ 4] = inv_s_box[st[ 4]];
-    st[ 8] = inv_s_box[st[ 8]]; st[12] = inv_s_box[st[12]];
-
-    tt = st[13]; st[13] = inv_s_box[st[9]]; st[ 9] = inv_s_box[st[5]];
-    st[ 5] = inv_s_box[st[1]]; st[ 1] = inv_s_box[ tt ];
-
-    tt = st[2]; st[ 2] = inv_s_box[st[10]]; st[10] = inv_s_box[ tt ];
-    tt = st[6]; st[ 6] = inv_s_box[st[14]]; st[14] = inv_s_box[ tt ];
-
-    tt = st[3]; st[ 3] = inv_s_box[st[ 7]]; st[ 7] = inv_s_box[st[11]];
-    st[11] = inv_s_box[st[15]]; st[15] = inv_s_box[ tt ];
-}
-
-#if defined( VERSION_1 )
-  static void mix_sub_columns( uint_8t dt[N_BLOCK] )
-  { uint_8t st[N_BLOCK];
-    block16_copy(st, dt);
-#else
-  static void mix_sub_columns( uint_8t dt[N_BLOCK], uint_8t st[N_BLOCK] )
-  {
-#endif
-    dt[ 0] = gfm2_s_box[st[0]] ^ gfm3_s_box[st[5]] ^ s_box[st[10]] ^ s_box[st[15]];
-    dt[ 1] = s_box[st[0]] ^ gfm2_s_box[st[5]] ^ gfm3_s_box[st[10]] ^ s_box[st[15]];
-    dt[ 2] = s_box[st[0]] ^ s_box[st[5]] ^ gfm2_s_box[st[10]] ^ gfm3_s_box[st[15]];
-    dt[ 3] = gfm3_s_box[st[0]] ^ s_box[st[5]] ^ s_box[st[10]] ^ gfm2_s_box[st[15]];
-
-    dt[ 4] = gfm2_s_box[st[4]] ^ gfm3_s_box[st[9]] ^ s_box[st[14]] ^ s_box[st[3]];
-    dt[ 5] = s_box[st[4]] ^ gfm2_s_box[st[9]] ^ gfm3_s_box[st[14]] ^ s_box[st[3]];
-    dt[ 6] = s_box[st[4]] ^ s_box[st[9]] ^ gfm2_s_box[st[14]] ^ gfm3_s_box[st[3]];
-    dt[ 7] = gfm3_s_box[st[4]] ^ s_box[st[9]] ^ s_box[st[14]] ^ gfm2_s_box[st[3]];
-
-    dt[ 8] = gfm2_s_box[st[8]] ^ gfm3_s_box[st[13]] ^ s_box[st[2]] ^ s_box[st[7]];
-    dt[ 9] = s_box[st[8]] ^ gfm2_s_box[st[13]] ^ gfm3_s_box[st[2]] ^ s_box[st[7]];
-    dt[10] = s_box[st[8]] ^ s_box[st[13]] ^ gfm2_s_box[st[2]] ^ gfm3_s_box[st[7]];
-    dt[11] = gfm3_s_box[st[8]] ^ s_box[st[13]] ^ s_box[st[2]] ^ gfm2_s_box[st[7]];
-
-    dt[12] = gfm2_s_box[st[12]] ^ gfm3_s_box[st[1]] ^ s_box[st[6]] ^ s_box[st[11]];
-    dt[13] = s_box[st[12]] ^ gfm2_s_box[st[1]] ^ gfm3_s_box[st[6]] ^ s_box[st[11]];
-    dt[14] = s_box[st[12]] ^ s_box[st[1]] ^ gfm2_s_box[st[6]] ^ gfm3_s_box[st[11]];
-    dt[15] = gfm3_s_box[st[12]] ^ s_box[st[1]] ^ s_box[st[6]] ^ gfm2_s_box[st[11]];
-  }
-
-#if defined( VERSION_1 )
-  static void inv_mix_sub_columns( uint_8t dt[N_BLOCK] )
-  { uint_8t st[N_BLOCK];
-    block16_copy(st, dt);
-#else
-  static void inv_mix_sub_columns( uint_8t dt[N_BLOCK], uint_8t st[N_BLOCK] )
-  {
-#endif
-    dt[ 0] = inv_s_box[gfmul_e[st[ 0]] ^ gfmul_b[st[ 1]] ^ gfmul_d[st[ 2]] ^ gfmul_9[st[ 3]]];
-    dt[ 5] = inv_s_box[gfmul_9[st[ 0]] ^ gfmul_e[st[ 1]] ^ gfmul_b[st[ 2]] ^ gfmul_d[st[ 3]]];
-    dt[10] = inv_s_box[gfmul_d[st[ 0]] ^ gfmul_9[st[ 1]] ^ gfmul_e[st[ 2]] ^ gfmul_b[st[ 3]]];
-    dt[15] = inv_s_box[gfmul_b[st[ 0]] ^ gfmul_d[st[ 1]] ^ gfmul_9[st[ 2]] ^ gfmul_e[st[ 3]]];
-
-    dt[ 4] = inv_s_box[gfmul_e[st[ 4]] ^ gfmul_b[st[ 5]] ^ gfmul_d[st[ 6]] ^ gfmul_9[st[ 7]]];
-    dt[ 9] = inv_s_box[gfmul_9[st[ 4]] ^ gfmul_e[st[ 5]] ^ gfmul_b[st[ 6]] ^ gfmul_d[st[ 7]]];
-    dt[14] = inv_s_box[gfmul_d[st[ 4]] ^ gfmul_9[st[ 5]] ^ gfmul_e[st[ 6]] ^ gfmul_b[st[ 7]]];
-    dt[ 3] = inv_s_box[gfmul_b[st[ 4]] ^ gfmul_d[st[ 5]] ^ gfmul_9[st[ 6]] ^ gfmul_e[st[ 7]]];
-
-    dt[ 8] = inv_s_box[gfmul_e[st[ 8]] ^ gfmul_b[st[ 9]] ^ gfmul_d[st[10]] ^ gfmul_9[st[11]]];
-    dt[13] = inv_s_box[gfmul_9[st[ 8]] ^ gfmul_e[st[ 9]] ^ gfmul_b[st[10]] ^ gfmul_d[st[11]]];
-    dt[ 2] = inv_s_box[gfmul_d[st[ 8]] ^ gfmul_9[st[ 9]] ^ gfmul_e[st[10]] ^ gfmul_b[st[11]]];
-    dt[ 7] = inv_s_box[gfmul_b[st[ 8]] ^ gfmul_d[st[ 9]] ^ gfmul_9[st[10]] ^ gfmul_e[st[11]]];
-
-    dt[12] = inv_s_box[gfmul_e[st[12]] ^ gfmul_b[st[13]] ^ gfmul_d[st[14]] ^ gfmul_9[st[15]]];
-    dt[ 1] = inv_s_box[gfmul_9[st[12]] ^ gfmul_e[st[13]] ^ gfmul_b[st[14]] ^ gfmul_d[st[15]]];
-    dt[ 6] = inv_s_box[gfmul_d[st[12]] ^ gfmul_9[st[13]] ^ gfmul_e[st[14]] ^ gfmul_b[st[15]]];
-    dt[11] = inv_s_box[gfmul_b[st[12]] ^ gfmul_d[st[13]] ^ gfmul_9[st[14]] ^ gfmul_e[st[15]]];
-  }
-
-#if defined( AES_ENC_PREKEYED ) || defined( AES_DEC_PREKEYED )
-
-/*  Set the cipher key for the pre-keyed version */
-
-return_type aes_set_key( const unsigned char key[], length_type keylen, aes_context ctx[1] )
-{
-    uint_8t cc, rc, hi;
-
-    switch( keylen )
-    {
-    case 16:
-    case 128:
-        keylen = 16;
-        break;
-    case 24:
-    case 192:
-        keylen = 24;
-        break;
-    case 32:
-    case 256:
-        keylen = 32;
-        break;
-    default:
-        ctx->rnd = 0;
-        return (return_type) -1;
-    }
-    block_copy(ctx->ksch, key, keylen);
-    hi = (keylen + 28) << 2;
-    ctx->rnd = (hi >> 4) - 1;
-    for( cc = keylen, rc = 1; cc < hi; cc += 4 )
-    {   uint_8t tt, t0, t1, t2, t3;
-
-        t0 = ctx->ksch[cc - 4];
-        t1 = ctx->ksch[cc - 3];
-        t2 = ctx->ksch[cc - 2];
-        t3 = ctx->ksch[cc - 1];
-        if( cc % keylen == 0 )
-        {
-            tt = t0;
-            t0 = s_box[t1] ^ rc;
-            t1 = s_box[t2];
-            t2 = s_box[t3];
-            t3 = s_box[tt];
-            rc = f2(rc);
-        }
-        else if( keylen > 24 && cc % keylen == 16 )
-        {
-            t0 = s_box[t0];
-            t1 = s_box[t1];
-            t2 = s_box[t2];
-            t3 = s_box[t3];
-        }
-        tt = cc - keylen;
-        ctx->ksch[cc + 0] = ctx->ksch[tt + 0] ^ t0;
-        ctx->ksch[cc + 1] = ctx->ksch[tt + 1] ^ t1;
-        ctx->ksch[cc + 2] = ctx->ksch[tt + 2] ^ t2;
-        ctx->ksch[cc + 3] = ctx->ksch[tt + 3] ^ t3;
-    }
-    return 0;
-}
-
-#endif
-
-#if defined( AES_ENC_PREKEYED )
-
-/*  Encrypt a single block of 16 bytes */
-
-return_type aes_encrypt( const unsigned char in[N_BLOCK], unsigned char  out[N_BLOCK], const aes_context ctx[1] )
-{
-    if( ctx->rnd )
-    {
-        uint_8t s1[N_BLOCK], r;
-        copy_and_key( s1, in, ctx->ksch );
-
-        for( r = 1 ; r < ctx->rnd ; ++r )
-#if defined( VERSION_1 )
-        {
-            mix_sub_columns( s1 );
-            add_round_key( s1, ctx->ksch + r * N_BLOCK);
-        }
-#else
-        {   uint_8t s2[N_BLOCK];
-            mix_sub_columns( s2, s1 );
-            copy_and_key( s1, s2, ctx->ksch + r * N_BLOCK);
-        }
-#endif
-        shift_sub_rows( s1 );
-        copy_and_key( out, s1, ctx->ksch + r * N_BLOCK );
-    }
-    else
-        return (return_type) -1;
-    return 0;
-}
-
-#endif
-
-#if defined( AES_DEC_PREKEYED )
-
-/*  Decrypt a single block of 16 bytes */
-
-return_type aes_decrypt( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK], const aes_context ctx[1] )
-{
-    if( ctx->rnd )
-    {
-        uint_8t s1[N_BLOCK], r;
-        copy_and_key( s1, in, ctx->ksch + ctx->rnd * N_BLOCK );
-        inv_shift_sub_rows( s1 );
-
-        for( r = ctx->rnd ; --r ; )
-#if defined( VERSION_1 )
-        {
-            add_round_key( s1, ctx->ksch + r * N_BLOCK );
-            inv_mix_sub_columns( s1 );
-        }
-#else
-        {   uint_8t s2[N_BLOCK];
-            copy_and_key( s2, s1, ctx->ksch + r * N_BLOCK );
-            inv_mix_sub_columns( s1, s2 );
-        }
-#endif
-        copy_and_key( out, s1, ctx->ksch );
-    }
-    else
-        return (return_type) -1;
-    return 0;
-}
-
-#endif
-
-#if defined( AES_ENC_128_OTFK )
-
-/*  The 'on the fly' encryption key update for for 128 bit keys */
-
-static void update_encrypt_key_128( uint_8t k[N_BLOCK], uint_8t *rc )
-{   uint_8t cc;
-
-    k[0] ^= s_box[k[13]] ^ *rc;
-    k[1] ^= s_box[k[14]];
-    k[2] ^= s_box[k[15]];
-    k[3] ^= s_box[k[12]];
-    *rc = f2( *rc );
-
-    for(cc = 4; cc < 16; cc += 4 )
-    {
-        k[cc + 0] ^= k[cc - 4];
-        k[cc + 1] ^= k[cc - 3];
-        k[cc + 2] ^= k[cc - 2];
-        k[cc + 3] ^= k[cc - 1];
-    }
-}
-
-/*  Encrypt a single block of 16 bytes with 'on the fly' 128 bit keying */
-
-void aes_encrypt_128( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK],
-                     const unsigned char key[N_BLOCK], unsigned char o_key[N_BLOCK] )
-{   uint_8t s1[N_BLOCK], r, rc = 1;
-
-    if(o_key != key)
-        block16_copy( o_key, key );
-    copy_and_key( s1, in, o_key );
-
-    for( r = 1 ; r < 10 ; ++r )
-#if defined( VERSION_1 )
-    {
-        mix_sub_columns( s1 );
-        update_encrypt_key_128( o_key, &rc );
-        add_round_key( s1, o_key );
-    }
-#else
-    {   uint_8t s2[N_BLOCK];
-        mix_sub_columns( s2, s1 );
-        update_encrypt_key_128( o_key, &rc );
-        copy_and_key( s1, s2, o_key );
-    }
-#endif
-
-    shift_sub_rows( s1 );
-    update_encrypt_key_128( o_key, &rc );
-    copy_and_key( out, s1, o_key );
-}
-
-#endif
-
-#if defined( AES_DEC_128_OTFK )
-
-/*  The 'on the fly' decryption key update for for 128 bit keys */
-
-static void update_decrypt_key_128( uint_8t k[N_BLOCK], uint_8t *rc )
-{   uint_8t cc;
-
-    for( cc = 12; cc > 0; cc -= 4 )
-    {
-        k[cc + 0] ^= k[cc - 4];
-        k[cc + 1] ^= k[cc - 3];
-        k[cc + 2] ^= k[cc - 2];
-        k[cc + 3] ^= k[cc - 1];
-    }
-    *rc = d2(*rc);
-    k[0] ^= s_box[k[13]] ^ *rc;
-    k[1] ^= s_box[k[14]];
-    k[2] ^= s_box[k[15]];
-    k[3] ^= s_box[k[12]];
-}
-
-/*  Decrypt a single block of 16 bytes with 'on the fly' 128 bit keying */
-
-void aes_decrypt_128( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK],
-                      const unsigned char key[N_BLOCK], unsigned char o_key[N_BLOCK] )
-{
-    uint_8t s1[N_BLOCK], r, rc = 0x6c;
-    if(o_key != key)
-        block16_copy( o_key, key );
-
-    copy_and_key( s1, in, o_key );
-    inv_shift_sub_rows( s1 );
-
-    for( r = 10 ; --r ; )
-#if defined( VERSION_1 )
-    {
-        update_decrypt_key_128( o_key, &rc );
-        add_round_key( s1, o_key );
-        inv_mix_sub_columns( s1 );
-    }
-#else
-    {   uint_8t s2[N_BLOCK];
-        update_decrypt_key_128( o_key, &rc );
-        copy_and_key( s2, s1, o_key );
-        inv_mix_sub_columns( s1, s2 );
-    }
-#endif
-    update_decrypt_key_128( o_key, &rc );
-    copy_and_key( out, s1, o_key );
-}
-
-#endif
-
-#if defined( AES_ENC_256_OTFK )
-
-/*  The 'on the fly' encryption key update for for 256 bit keys */
-
-static void update_encrypt_key_256( uint_8t k[2 * N_BLOCK], uint_8t *rc )
-{   uint_8t cc;
-
-    k[0] ^= s_box[k[29]] ^ *rc;
-    k[1] ^= s_box[k[30]];
-    k[2] ^= s_box[k[31]];
-    k[3] ^= s_box[k[28]];
-    *rc = f2( *rc );
-
-    for(cc = 4; cc < 16; cc += 4)
-    {
-        k[cc + 0] ^= k[cc - 4];
-        k[cc + 1] ^= k[cc - 3];
-        k[cc + 2] ^= k[cc - 2];
-        k[cc + 3] ^= k[cc - 1];
-    }
-
-    k[16] ^= s_box[k[12]];
-    k[17] ^= s_box[k[13]];
-    k[18] ^= s_box[k[14]];
-    k[19] ^= s_box[k[15]];
-
-    for( cc = 20; cc < 32; cc += 4 )
-    {
-        k[cc + 0] ^= k[cc - 4];
-        k[cc + 1] ^= k[cc - 3];
-        k[cc + 2] ^= k[cc - 2];
-        k[cc + 3] ^= k[cc - 1];
-    }
-}
-
-/*  Encrypt a single block of 16 bytes with 'on the fly' 256 bit keying */
-
-void aes_encrypt_256( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK],
-                      const unsigned char key[2 * N_BLOCK], unsigned char o_key[2 * N_BLOCK] )
-{
-    uint_8t s1[N_BLOCK], r, rc = 1;
-    if(o_key != key)
-    {
-        block16_copy( o_key, key );
-        block16_copy( o_key + 16, key + 16 );
-    }
-    copy_and_key( s1, in, o_key );
-
-    for( r = 1 ; r < 14 ; ++r )
-#if defined( VERSION_1 )
-    {
-        mix_sub_columns(s1);
-        if( r & 1 )
-            add_round_key( s1, o_key + 16 );
-        else
-        {
-            update_encrypt_key_256( o_key, &rc );
-            add_round_key( s1, o_key );
-        }
-    }
-#else
-    {   uint_8t s2[N_BLOCK];
-        mix_sub_columns( s2, s1 );
-        if( r & 1 )
-            copy_and_key( s1, s2, o_key + 16 );
-        else
-        {
-            update_encrypt_key_256( o_key, &rc );
-            copy_and_key( s1, s2, o_key );
-        }
-    }
-#endif
-
-    shift_sub_rows( s1 );
-    update_encrypt_key_256( o_key, &rc );
-    copy_and_key( out, s1, o_key );
-}
-
-#endif
-
-#if defined( AES_DEC_256_OTFK )
-
-/*  The 'on the fly' encryption key update for for 256 bit keys */
-
-static void update_decrypt_key_256( uint_8t k[2 * N_BLOCK], uint_8t *rc )
-{   uint_8t cc;
-
-    for(cc = 28; cc > 16; cc -= 4)
-    {
-        k[cc + 0] ^= k[cc - 4];
-        k[cc + 1] ^= k[cc - 3];
-        k[cc + 2] ^= k[cc - 2];
-        k[cc + 3] ^= k[cc - 1];
-    }
-
-    k[16] ^= s_box[k[12]];
-    k[17] ^= s_box[k[13]];
-    k[18] ^= s_box[k[14]];
-    k[19] ^= s_box[k[15]];
-
-    for(cc = 12; cc > 0; cc -= 4)
-    {
-        k[cc + 0] ^= k[cc - 4];
-        k[cc + 1] ^= k[cc - 3];
-        k[cc + 2] ^= k[cc - 2];
-        k[cc + 3] ^= k[cc - 1];
-    }
-
-    *rc = d2(*rc);
-    k[0] ^= s_box[k[29]] ^ *rc;
-    k[1] ^= s_box[k[30]];
-    k[2] ^= s_box[k[31]];
-    k[3] ^= s_box[k[28]];
-}
-
-/*  Decrypt a single block of 16 bytes with 'on the fly'
-    256 bit keying
-*/
-void aes_decrypt_256( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK],
-                      const unsigned char key[2 * N_BLOCK], unsigned char o_key[2 * N_BLOCK] )
-{
-    uint_8t s1[N_BLOCK], r, rc = 0x80;
-
-    if(o_key != key)
-    {
-        block16_copy( o_key, key );
-        block16_copy( o_key + 16, key + 16 );
-    }
-
-    copy_and_key( s1, in, o_key );
-    inv_shift_sub_rows( s1 );
-
-    for( r = 14 ; --r ; )
-#if defined( VERSION_1 )
-    {
-        if( ( r & 1 ) )
-        {
-            update_decrypt_key_256( o_key, &rc );
-            add_round_key( s1, o_key + 16 );
-        }
-        else
-            add_round_key( s1, o_key );
-        inv_mix_sub_columns( s1 );
-    }
-#else
-    {   uint_8t s2[N_BLOCK];
-        if( ( r & 1 ) )
-        {
-            update_decrypt_key_256( o_key, &rc );
-            copy_and_key( s2, s1, o_key + 16 );
-        }
-        else
-            copy_and_key( s2, s1, o_key );
-        inv_mix_sub_columns( s1, s2 );
-    }
-#endif
-    copy_and_key( out, s1, o_key );
-}
-
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 09/09/2006
+
+ This is an AES implementation that uses only 8-bit byte operations on the
+ cipher state (there are options to use 32-bit types if available).
+
+ The combination of mix columns and byte substitution used here is based on
+ that developed by Karl Malbrain. His contribution is acknowledged.
+ */
+
+/* Adapted for TrueCrypt:
+  - Macro-generated tables were replaced with static data to enable compiling
+    with MSVC++ 1.5 which runs out of resources when expanding large macros.
+*/
+
+#pragma optimize ("t", on)
+
+/* define if you have a fast memcpy function on your system */
+#if 1
+#  define HAVE_MEMCPY
+#  include <string.h>
+#  if defined( _MSC_VER )
+#    ifndef DEBUG
+#      pragma intrinsic( memcpy )
+#    endif
+#  endif
+#endif
+
+/* define if you have fast 32-bit types on your system */
+#if 1
+#  define HAVE_UINT_32T
+#endif
+
+/* alternative versions (test for performance on your system) */
+#if 0
+#  define VERSION_1
+#endif
+
+#include "AesSmall.h"
+
+#define WPOLY   0x011b
+#define DPOLY   0x008d
+#define f1(x)   (x)
+#define f2(x)   ((x<<1) ^ (((x>>7) & 1) * WPOLY))
+#define f4(x)   ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
+#define f8(x)   ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \
+                        ^ (((x>>5) & 4) * WPOLY))
+#define d2(x)   (((x) >> 1) ^ ((x) & 1 ? DPOLY : 0))
+
+#define f3(x)   (f2(x) ^ x)
+#define f9(x)   (f8(x) ^ x)
+#define fb(x)   (f8(x) ^ f2(x) ^ x)
+#define fd(x)   (f8(x) ^ f4(x) ^ x)
+#define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
+
+static const uint_8t s_box[256] = {
+	0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,
+	0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
+	0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,
+	0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
+	0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,
+	0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
+	0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,
+	0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
+	0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,
+	0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
+	0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,
+	0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
+	0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,
+	0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
+	0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,
+	0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
+	0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,
+	0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
+	0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,
+	0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
+	0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,
+	0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
+	0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,
+	0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
+	0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,
+	0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
+	0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,
+	0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
+	0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,
+	0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
+	0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,
+	0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+};
+
+static const uint_8t inv_s_box[256] = {
+	0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38,
+	0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb,
+	0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87,
+	0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb,
+	0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d,
+	0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e,
+	0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2,
+	0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25,
+	0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16,
+	0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92,
+	0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda,
+	0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84,
+	0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a,
+	0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06,
+	0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02,
+	0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b,
+	0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea,
+	0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73,
+	0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85,
+	0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e,
+	0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89,
+	0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b,
+	0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20,
+	0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4,
+	0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31,
+	0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f,
+	0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d,
+	0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef,
+	0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0,
+	0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61,
+	0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26,
+	0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+};
+
+static const uint_8t gfm2_s_box[256] = {
+	0xc6,0xf8,0xee,0xf6,0xff,0xd6,0xde,0x91,
+	0x60,0x02,0xce,0x56,0xe7,0xb5,0x4d,0xec,
+	0x8f,0x1f,0x89,0xfa,0xef,0xb2,0x8e,0xfb,
+	0x41,0xb3,0x5f,0x45,0x23,0x53,0xe4,0x9b,
+	0x75,0xe1,0x3d,0x4c,0x6c,0x7e,0xf5,0x83,
+	0x68,0x51,0xd1,0xf9,0xe2,0xab,0x62,0x2a,
+	0x08,0x95,0x46,0x9d,0x30,0x37,0x0a,0x2f,
+	0x0e,0x24,0x1b,0xdf,0xcd,0x4e,0x7f,0xea,
+	0x12,0x1d,0x58,0x34,0x36,0xdc,0xb4,0x5b,
+	0xa4,0x76,0xb7,0x7d,0x52,0xdd,0x5e,0x13,
+	0xa6,0xb9,0x00,0xc1,0x40,0xe3,0x79,0xb6,
+	0xd4,0x8d,0x67,0x72,0x94,0x98,0xb0,0x85,
+	0xbb,0xc5,0x4f,0xed,0x86,0x9a,0x66,0x11,
+	0x8a,0xe9,0x04,0xfe,0xa0,0x78,0x25,0x4b,
+	0xa2,0x5d,0x80,0x05,0x3f,0x21,0x70,0xf1,
+	0x63,0x77,0xaf,0x42,0x20,0xe5,0xfd,0xbf,
+	0x81,0x18,0x26,0xc3,0xbe,0x35,0x88,0x2e,
+	0x93,0x55,0xfc,0x7a,0xc8,0xba,0x32,0xe6,
+	0xc0,0x19,0x9e,0xa3,0x44,0x54,0x3b,0x0b,
+	0x8c,0xc7,0x6b,0x28,0xa7,0xbc,0x16,0xad,
+	0xdb,0x64,0x74,0x14,0x92,0x0c,0x48,0xb8,
+	0x9f,0xbd,0x43,0xc4,0x39,0x31,0xd3,0xf2,
+	0xd5,0x8b,0x6e,0xda,0x01,0xb1,0x9c,0x49,
+	0xd8,0xac,0xf3,0xcf,0xca,0xf4,0x47,0x10,
+	0x6f,0xf0,0x4a,0x5c,0x38,0x57,0x73,0x97,
+	0xcb,0xa1,0xe8,0x3e,0x96,0x61,0x0d,0x0f,
+	0xe0,0x7c,0x71,0xcc,0x90,0x06,0xf7,0x1c,
+	0xc2,0x6a,0xae,0x69,0x17,0x99,0x3a,0x27,
+	0xd9,0xeb,0x2b,0x22,0xd2,0xa9,0x07,0x33,
+	0x2d,0x3c,0x15,0xc9,0x87,0xaa,0x50,0xa5,
+	0x03,0x59,0x09,0x1a,0x65,0xd7,0x84,0xd0,
+	0x82,0x29,0x5a,0x1e,0x7b,0xa8,0x6d,0x2c
+};
+
+static const uint_8t gfm3_s_box[256] = {
+	0xa5,0x84,0x99,0x8d,0x0d,0xbd,0xb1,0x54,
+	0x50,0x03,0xa9,0x7d,0x19,0x62,0xe6,0x9a,
+	0x45,0x9d,0x40,0x87,0x15,0xeb,0xc9,0x0b,
+	0xec,0x67,0xfd,0xea,0xbf,0xf7,0x96,0x5b,
+	0xc2,0x1c,0xae,0x6a,0x5a,0x41,0x02,0x4f,
+	0x5c,0xf4,0x34,0x08,0x93,0x73,0x53,0x3f,
+	0x0c,0x52,0x65,0x5e,0x28,0xa1,0x0f,0xb5,
+	0x09,0x36,0x9b,0x3d,0x26,0x69,0xcd,0x9f,
+	0x1b,0x9e,0x74,0x2e,0x2d,0xb2,0xee,0xfb,
+	0xf6,0x4d,0x61,0xce,0x7b,0x3e,0x71,0x97,
+	0xf5,0x68,0x00,0x2c,0x60,0x1f,0xc8,0xed,
+	0xbe,0x46,0xd9,0x4b,0xde,0xd4,0xe8,0x4a,
+	0x6b,0x2a,0xe5,0x16,0xc5,0xd7,0x55,0x94,
+	0xcf,0x10,0x06,0x81,0xf0,0x44,0xba,0xe3,
+	0xf3,0xfe,0xc0,0x8a,0xad,0xbc,0x48,0x04,
+	0xdf,0xc1,0x75,0x63,0x30,0x1a,0x0e,0x6d,
+	0x4c,0x14,0x35,0x2f,0xe1,0xa2,0xcc,0x39,
+	0x57,0xf2,0x82,0x47,0xac,0xe7,0x2b,0x95,
+	0xa0,0x98,0xd1,0x7f,0x66,0x7e,0xab,0x83,
+	0xca,0x29,0xd3,0x3c,0x79,0xe2,0x1d,0x76,
+	0x3b,0x56,0x4e,0x1e,0xdb,0x0a,0x6c,0xe4,
+	0x5d,0x6e,0xef,0xa6,0xa8,0xa4,0x37,0x8b,
+	0x32,0x43,0x59,0xb7,0x8c,0x64,0xd2,0xe0,
+	0xb4,0xfa,0x07,0x25,0xaf,0x8e,0xe9,0x18,
+	0xd5,0x88,0x6f,0x72,0x24,0xf1,0xc7,0x51,
+	0x23,0x7c,0x9c,0x21,0xdd,0xdc,0x86,0x85,
+	0x90,0x42,0xc4,0xaa,0xd8,0x05,0x01,0x12,
+	0xa3,0x5f,0xf9,0xd0,0x91,0x58,0x27,0xb9,
+	0x38,0x13,0xb3,0x33,0xbb,0x70,0x89,0xa7,
+	0xb6,0x22,0x92,0x20,0x49,0xff,0x78,0x7a,
+	0x8f,0xf8,0x80,0x17,0xda,0x31,0xc6,0xb8,
+	0xc3,0xb0,0x77,0x11,0xcb,0xfc,0xd6,0x3a
+};
+
+static const uint_8t gfmul_9[256] = {
+	0x00,0x09,0x12,0x1b,0x24,0x2d,0x36,0x3f,
+	0x48,0x41,0x5a,0x53,0x6c,0x65,0x7e,0x77,
+	0x90,0x99,0x82,0x8b,0xb4,0xbd,0xa6,0xaf,
+	0xd8,0xd1,0xca,0xc3,0xfc,0xf5,0xee,0xe7,
+	0x3b,0x32,0x29,0x20,0x1f,0x16,0x0d,0x04,
+	0x73,0x7a,0x61,0x68,0x57,0x5e,0x45,0x4c,
+	0xab,0xa2,0xb9,0xb0,0x8f,0x86,0x9d,0x94,
+	0xe3,0xea,0xf1,0xf8,0xc7,0xce,0xd5,0xdc,
+	0x76,0x7f,0x64,0x6d,0x52,0x5b,0x40,0x49,
+	0x3e,0x37,0x2c,0x25,0x1a,0x13,0x08,0x01,
+	0xe6,0xef,0xf4,0xfd,0xc2,0xcb,0xd0,0xd9,
+	0xae,0xa7,0xbc,0xb5,0x8a,0x83,0x98,0x91,
+	0x4d,0x44,0x5f,0x56,0x69,0x60,0x7b,0x72,
+	0x05,0x0c,0x17,0x1e,0x21,0x28,0x33,0x3a,
+	0xdd,0xd4,0xcf,0xc6,0xf9,0xf0,0xeb,0xe2,
+	0x95,0x9c,0x87,0x8e,0xb1,0xb8,0xa3,0xaa,
+	0xec,0xe5,0xfe,0xf7,0xc8,0xc1,0xda,0xd3,
+	0xa4,0xad,0xb6,0xbf,0x80,0x89,0x92,0x9b,
+	0x7c,0x75,0x6e,0x67,0x58,0x51,0x4a,0x43,
+	0x34,0x3d,0x26,0x2f,0x10,0x19,0x02,0x0b,
+	0xd7,0xde,0xc5,0xcc,0xf3,0xfa,0xe1,0xe8,
+	0x9f,0x96,0x8d,0x84,0xbb,0xb2,0xa9,0xa0,
+	0x47,0x4e,0x55,0x5c,0x63,0x6a,0x71,0x78,
+	0x0f,0x06,0x1d,0x14,0x2b,0x22,0x39,0x30,
+	0x9a,0x93,0x88,0x81,0xbe,0xb7,0xac,0xa5,
+	0xd2,0xdb,0xc0,0xc9,0xf6,0xff,0xe4,0xed,
+	0x0a,0x03,0x18,0x11,0x2e,0x27,0x3c,0x35,
+	0x42,0x4b,0x50,0x59,0x66,0x6f,0x74,0x7d,
+	0xa1,0xa8,0xb3,0xba,0x85,0x8c,0x97,0x9e,
+	0xe9,0xe0,0xfb,0xf2,0xcd,0xc4,0xdf,0xd6,
+	0x31,0x38,0x23,0x2a,0x15,0x1c,0x07,0x0e,
+	0x79,0x70,0x6b,0x62,0x5d,0x54,0x4f,0x46
+};
+
+static const uint_8t gfmul_b[256] = {
+	0x00,0x0b,0x16,0x1d,0x2c,0x27,0x3a,0x31,
+	0x58,0x53,0x4e,0x45,0x74,0x7f,0x62,0x69,
+	0xb0,0xbb,0xa6,0xad,0x9c,0x97,0x8a,0x81,
+	0xe8,0xe3,0xfe,0xf5,0xc4,0xcf,0xd2,0xd9,
+	0x7b,0x70,0x6d,0x66,0x57,0x5c,0x41,0x4a,
+	0x23,0x28,0x35,0x3e,0x0f,0x04,0x19,0x12,
+	0xcb,0xc0,0xdd,0xd6,0xe7,0xec,0xf1,0xfa,
+	0x93,0x98,0x85,0x8e,0xbf,0xb4,0xa9,0xa2,
+	0xf6,0xfd,0xe0,0xeb,0xda,0xd1,0xcc,0xc7,
+	0xae,0xa5,0xb8,0xb3,0x82,0x89,0x94,0x9f,
+	0x46,0x4d,0x50,0x5b,0x6a,0x61,0x7c,0x77,
+	0x1e,0x15,0x08,0x03,0x32,0x39,0x24,0x2f,
+	0x8d,0x86,0x9b,0x90,0xa1,0xaa,0xb7,0xbc,
+	0xd5,0xde,0xc3,0xc8,0xf9,0xf2,0xef,0xe4,
+	0x3d,0x36,0x2b,0x20,0x11,0x1a,0x07,0x0c,
+	0x65,0x6e,0x73,0x78,0x49,0x42,0x5f,0x54,
+	0xf7,0xfc,0xe1,0xea,0xdb,0xd0,0xcd,0xc6,
+	0xaf,0xa4,0xb9,0xb2,0x83,0x88,0x95,0x9e,
+	0x47,0x4c,0x51,0x5a,0x6b,0x60,0x7d,0x76,
+	0x1f,0x14,0x09,0x02,0x33,0x38,0x25,0x2e,
+	0x8c,0x87,0x9a,0x91,0xa0,0xab,0xb6,0xbd,
+	0xd4,0xdf,0xc2,0xc9,0xf8,0xf3,0xee,0xe5,
+	0x3c,0x37,0x2a,0x21,0x10,0x1b,0x06,0x0d,
+	0x64,0x6f,0x72,0x79,0x48,0x43,0x5e,0x55,
+	0x01,0x0a,0x17,0x1c,0x2d,0x26,0x3b,0x30,
+	0x59,0x52,0x4f,0x44,0x75,0x7e,0x63,0x68,
+	0xb1,0xba,0xa7,0xac,0x9d,0x96,0x8b,0x80,
+	0xe9,0xe2,0xff,0xf4,0xc5,0xce,0xd3,0xd8,
+	0x7a,0x71,0x6c,0x67,0x56,0x5d,0x40,0x4b,
+	0x22,0x29,0x34,0x3f,0x0e,0x05,0x18,0x13,
+	0xca,0xc1,0xdc,0xd7,0xe6,0xed,0xf0,0xfb,
+	0x92,0x99,0x84,0x8f,0xbe,0xb5,0xa8,0xa3
+};
+
+static const uint_8t gfmul_d[256] = {
+	0x00,0x0d,0x1a,0x17,0x34,0x39,0x2e,0x23,
+	0x68,0x65,0x72,0x7f,0x5c,0x51,0x46,0x4b,
+	0xd0,0xdd,0xca,0xc7,0xe4,0xe9,0xfe,0xf3,
+	0xb8,0xb5,0xa2,0xaf,0x8c,0x81,0x96,0x9b,
+	0xbb,0xb6,0xa1,0xac,0x8f,0x82,0x95,0x98,
+	0xd3,0xde,0xc9,0xc4,0xe7,0xea,0xfd,0xf0,
+	0x6b,0x66,0x71,0x7c,0x5f,0x52,0x45,0x48,
+	0x03,0x0e,0x19,0x14,0x37,0x3a,0x2d,0x20,
+	0x6d,0x60,0x77,0x7a,0x59,0x54,0x43,0x4e,
+	0x05,0x08,0x1f,0x12,0x31,0x3c,0x2b,0x26,
+	0xbd,0xb0,0xa7,0xaa,0x89,0x84,0x93,0x9e,
+	0xd5,0xd8,0xcf,0xc2,0xe1,0xec,0xfb,0xf6,
+	0xd6,0xdb,0xcc,0xc1,0xe2,0xef,0xf8,0xf5,
+	0xbe,0xb3,0xa4,0xa9,0x8a,0x87,0x90,0x9d,
+	0x06,0x0b,0x1c,0x11,0x32,0x3f,0x28,0x25,
+	0x6e,0x63,0x74,0x79,0x5a,0x57,0x40,0x4d,
+	0xda,0xd7,0xc0,0xcd,0xee,0xe3,0xf4,0xf9,
+	0xb2,0xbf,0xa8,0xa5,0x86,0x8b,0x9c,0x91,
+	0x0a,0x07,0x10,0x1d,0x3e,0x33,0x24,0x29,
+	0x62,0x6f,0x78,0x75,0x56,0x5b,0x4c,0x41,
+	0x61,0x6c,0x7b,0x76,0x55,0x58,0x4f,0x42,
+	0x09,0x04,0x13,0x1e,0x3d,0x30,0x27,0x2a,
+	0xb1,0xbc,0xab,0xa6,0x85,0x88,0x9f,0x92,
+	0xd9,0xd4,0xc3,0xce,0xed,0xe0,0xf7,0xfa,
+	0xb7,0xba,0xad,0xa0,0x83,0x8e,0x99,0x94,
+	0xdf,0xd2,0xc5,0xc8,0xeb,0xe6,0xf1,0xfc,
+	0x67,0x6a,0x7d,0x70,0x53,0x5e,0x49,0x44,
+	0x0f,0x02,0x15,0x18,0x3b,0x36,0x21,0x2c,
+	0x0c,0x01,0x16,0x1b,0x38,0x35,0x22,0x2f,
+	0x64,0x69,0x7e,0x73,0x50,0x5d,0x4a,0x47,
+	0xdc,0xd1,0xc6,0xcb,0xe8,0xe5,0xf2,0xff,
+	0xb4,0xb9,0xae,0xa3,0x80,0x8d,0x9a,0x97
+};
+
+static const uint_8t gfmul_e[256] = {
+	0x00,0x0e,0x1c,0x12,0x38,0x36,0x24,0x2a,
+	0x70,0x7e,0x6c,0x62,0x48,0x46,0x54,0x5a,
+	0xe0,0xee,0xfc,0xf2,0xd8,0xd6,0xc4,0xca,
+	0x90,0x9e,0x8c,0x82,0xa8,0xa6,0xb4,0xba,
+	0xdb,0xd5,0xc7,0xc9,0xe3,0xed,0xff,0xf1,
+	0xab,0xa5,0xb7,0xb9,0x93,0x9d,0x8f,0x81,
+	0x3b,0x35,0x27,0x29,0x03,0x0d,0x1f,0x11,
+	0x4b,0x45,0x57,0x59,0x73,0x7d,0x6f,0x61,
+	0xad,0xa3,0xb1,0xbf,0x95,0x9b,0x89,0x87,
+	0xdd,0xd3,0xc1,0xcf,0xe5,0xeb,0xf9,0xf7,
+	0x4d,0x43,0x51,0x5f,0x75,0x7b,0x69,0x67,
+	0x3d,0x33,0x21,0x2f,0x05,0x0b,0x19,0x17,
+	0x76,0x78,0x6a,0x64,0x4e,0x40,0x52,0x5c,
+	0x06,0x08,0x1a,0x14,0x3e,0x30,0x22,0x2c,
+	0x96,0x98,0x8a,0x84,0xae,0xa0,0xb2,0xbc,
+	0xe6,0xe8,0xfa,0xf4,0xde,0xd0,0xc2,0xcc,
+	0x41,0x4f,0x5d,0x53,0x79,0x77,0x65,0x6b,
+	0x31,0x3f,0x2d,0x23,0x09,0x07,0x15,0x1b,
+	0xa1,0xaf,0xbd,0xb3,0x99,0x97,0x85,0x8b,
+	0xd1,0xdf,0xcd,0xc3,0xe9,0xe7,0xf5,0xfb,
+	0x9a,0x94,0x86,0x88,0xa2,0xac,0xbe,0xb0,
+	0xea,0xe4,0xf6,0xf8,0xd2,0xdc,0xce,0xc0,
+	0x7a,0x74,0x66,0x68,0x42,0x4c,0x5e,0x50,
+	0x0a,0x04,0x16,0x18,0x32,0x3c,0x2e,0x20,
+	0xec,0xe2,0xf0,0xfe,0xd4,0xda,0xc8,0xc6,
+	0x9c,0x92,0x80,0x8e,0xa4,0xaa,0xb8,0xb6,
+	0x0c,0x02,0x10,0x1e,0x34,0x3a,0x28,0x26,
+	0x7c,0x72,0x60,0x6e,0x44,0x4a,0x58,0x56,
+	0x37,0x39,0x2b,0x25,0x0f,0x01,0x13,0x1d,
+	0x47,0x49,0x5b,0x55,0x7f,0x71,0x63,0x6d,
+	0xd7,0xd9,0xcb,0xc5,0xef,0xe1,0xf3,0xfd,
+	0xa7,0xa9,0xbb,0xb5,0x9f,0x91,0x83,0x8d
+};
+
+#if defined( HAVE_UINT_32T )
+  typedef unsigned long uint_32t;
+#endif
+
+#if defined( HAVE_MEMCPY )
+#  define block_copy(d, s, l) memcpy(d, s, l)
+#  define block16_copy(d, s)  memcpy(d, s, N_BLOCK)
+#else
+#  define block_copy(d, s, l) copy_block(d, s, l)
+#  define block16_copy(d, s)  copy_block16(d, s)
+#endif
+
+/* block size 'nn' must be a multiple of four */
+
+static void copy_block16( void *d, const void *s )
+{
+#if defined( HAVE_UINT_32T )
+    ((uint_32t*)d)[ 0] = ((uint_32t*)s)[ 0];
+    ((uint_32t*)d)[ 1] = ((uint_32t*)s)[ 1];
+    ((uint_32t*)d)[ 2] = ((uint_32t*)s)[ 2];
+    ((uint_32t*)d)[ 3] = ((uint_32t*)s)[ 3];
+#else
+    ((uint_8t*)d)[ 0] = ((uint_8t*)s)[ 0];
+    ((uint_8t*)d)[ 1] = ((uint_8t*)s)[ 1];
+    ((uint_8t*)d)[ 2] = ((uint_8t*)s)[ 2];
+    ((uint_8t*)d)[ 3] = ((uint_8t*)s)[ 3];
+    ((uint_8t*)d)[ 4] = ((uint_8t*)s)[ 4];
+    ((uint_8t*)d)[ 5] = ((uint_8t*)s)[ 5];
+    ((uint_8t*)d)[ 6] = ((uint_8t*)s)[ 6];
+    ((uint_8t*)d)[ 7] = ((uint_8t*)s)[ 7];
+    ((uint_8t*)d)[ 8] = ((uint_8t*)s)[ 8];
+    ((uint_8t*)d)[ 9] = ((uint_8t*)s)[ 9];
+    ((uint_8t*)d)[10] = ((uint_8t*)s)[10];
+    ((uint_8t*)d)[11] = ((uint_8t*)s)[11];
+    ((uint_8t*)d)[12] = ((uint_8t*)s)[12];
+    ((uint_8t*)d)[13] = ((uint_8t*)s)[13];
+    ((uint_8t*)d)[14] = ((uint_8t*)s)[14];
+    ((uint_8t*)d)[15] = ((uint_8t*)s)[15];
+#endif
+}
+
+static void copy_block( void * d, void *s, uint_8t nn )
+{
+    while( nn-- )
+        *((uint_8t*)d)++ = *((uint_8t*)s)++;
+}
+
+static void xor_block( void *d, const void *s )
+{   /* XOR the block at s into the block at d in place (d ^= s); s is only read. */
+#if defined( HAVE_UINT_32T )
+    ((uint_32t*)d)[ 0] ^= ((uint_32t*)s)[ 0];   /* four word XORs: this is 16 bytes only if uint_32t  */
+    ((uint_32t*)d)[ 1] ^= ((uint_32t*)s)[ 1];   /* is exactly 4 bytes wide, and it presumes d and s   */
+    ((uint_32t*)d)[ 2] ^= ((uint_32t*)s)[ 2];   /* are suitably aligned for word access               */
+    ((uint_32t*)d)[ 3] ^= ((uint_32t*)s)[ 3];   /* NOTE(review): confirm both for every build target  */
+#else
+    ((uint_8t*)d)[ 0] ^= ((uint_8t*)s)[ 0];     /* byte path: sixteen single-byte XORs */
+    ((uint_8t*)d)[ 1] ^= ((uint_8t*)s)[ 1];
+    ((uint_8t*)d)[ 2] ^= ((uint_8t*)s)[ 2];
+    ((uint_8t*)d)[ 3] ^= ((uint_8t*)s)[ 3];
+    ((uint_8t*)d)[ 4] ^= ((uint_8t*)s)[ 4];
+    ((uint_8t*)d)[ 5] ^= ((uint_8t*)s)[ 5];
+    ((uint_8t*)d)[ 6] ^= ((uint_8t*)s)[ 6];
+    ((uint_8t*)d)[ 7] ^= ((uint_8t*)s)[ 7];
+    ((uint_8t*)d)[ 8] ^= ((uint_8t*)s)[ 8];
+    ((uint_8t*)d)[ 9] ^= ((uint_8t*)s)[ 9];
+    ((uint_8t*)d)[10] ^= ((uint_8t*)s)[10];
+    ((uint_8t*)d)[11] ^= ((uint_8t*)s)[11];
+    ((uint_8t*)d)[12] ^= ((uint_8t*)s)[12];
+    ((uint_8t*)d)[13] ^= ((uint_8t*)s)[13];
+    ((uint_8t*)d)[14] ^= ((uint_8t*)s)[14];
+    ((uint_8t*)d)[15] ^= ((uint_8t*)s)[15];
+#endif
+}
+
+static void copy_and_key( void *d, const void *s, const void *k )
+{   /* Write s XOR k into d (d = s ^ k); s and k are only read. */
+#if defined( HAVE_UINT_32T )
+    ((uint_32t*)d)[ 0] = ((uint_32t*)s)[ 0] ^ ((uint_32t*)k)[ 0];   /* word path: 16 bytes only if uint_32t is  */
+    ((uint_32t*)d)[ 1] = ((uint_32t*)s)[ 1] ^ ((uint_32t*)k)[ 1];   /* 4 bytes wide; presumes d, s and k are    */
+    ((uint_32t*)d)[ 2] = ((uint_32t*)s)[ 2] ^ ((uint_32t*)k)[ 2];   /* aligned for word access                  */
+    ((uint_32t*)d)[ 3] = ((uint_32t*)s)[ 3] ^ ((uint_32t*)k)[ 3];   /* NOTE(review): confirm for every target   */
+#elif 1     /* always taken when the word path is off, so the #else branch below is never compiled */
+    ((uint_8t*)d)[ 0] = ((uint_8t*)s)[ 0] ^ ((uint_8t*)k)[ 0];
+    ((uint_8t*)d)[ 1] = ((uint_8t*)s)[ 1] ^ ((uint_8t*)k)[ 1];
+    ((uint_8t*)d)[ 2] = ((uint_8t*)s)[ 2] ^ ((uint_8t*)k)[ 2];
+    ((uint_8t*)d)[ 3] = ((uint_8t*)s)[ 3] ^ ((uint_8t*)k)[ 3];
+    ((uint_8t*)d)[ 4] = ((uint_8t*)s)[ 4] ^ ((uint_8t*)k)[ 4];
+    ((uint_8t*)d)[ 5] = ((uint_8t*)s)[ 5] ^ ((uint_8t*)k)[ 5];
+    ((uint_8t*)d)[ 6] = ((uint_8t*)s)[ 6] ^ ((uint_8t*)k)[ 6];
+    ((uint_8t*)d)[ 7] = ((uint_8t*)s)[ 7] ^ ((uint_8t*)k)[ 7];
+    ((uint_8t*)d)[ 8] = ((uint_8t*)s)[ 8] ^ ((uint_8t*)k)[ 8];
+    ((uint_8t*)d)[ 9] = ((uint_8t*)s)[ 9] ^ ((uint_8t*)k)[ 9];
+    ((uint_8t*)d)[10] = ((uint_8t*)s)[10] ^ ((uint_8t*)k)[10];
+    ((uint_8t*)d)[11] = ((uint_8t*)s)[11] ^ ((uint_8t*)k)[11];
+    ((uint_8t*)d)[12] = ((uint_8t*)s)[12] ^ ((uint_8t*)k)[12];
+    ((uint_8t*)d)[13] = ((uint_8t*)s)[13] ^ ((uint_8t*)k)[13];
+    ((uint_8t*)d)[14] = ((uint_8t*)s)[14] ^ ((uint_8t*)k)[14];
+    ((uint_8t*)d)[15] = ((uint_8t*)s)[15] ^ ((uint_8t*)k)[15];
+#else       /* alternative kept for reference: copy first, then XOR the key in */
+    block16_copy(d, s);
+    xor_block(d, k);
+#endif
+}
+
+static void add_round_key( uint_8t d[N_BLOCK], const uint_8t k[N_BLOCK] )
+{   /* XOR the round-key block k into the state block d in place; a typed wrapper around xor_block(). */
+    xor_block(d, k);
+}
+
+static void shift_sub_rows( uint_8t st[N_BLOCK] )
+{   uint_8t tt;
+    /* In place: pass every state byte through s_box and permute positions; 0, 4, 8, 12 keep their place. */
+    st[ 0] = s_box[st[ 0]]; st[ 4] = s_box[st[ 4]];
+    st[ 8] = s_box[st[ 8]]; st[12] = s_box[st[12]];
+    /* positions 1, 5, 9, 13: each takes the byte from 4 places higher (13 takes the old byte 1) */
+    tt = st[1]; st[ 1] = s_box[st[ 5]]; st[ 5] = s_box[st[ 9]];
+    st[ 9] = s_box[st[13]]; st[13] = s_box[ tt ];
+    /* positions 2, 6, 10, 14: swap 2 with 10 and 6 with 14 */
+    tt = st[2]; st[ 2] = s_box[st[10]]; st[10] = s_box[ tt ];
+    tt = st[6]; st[ 6] = s_box[st[14]]; st[14] = s_box[ tt ];
+    /* positions 3, 7, 11, 15: each takes the byte from 4 places lower (3 takes the old byte 15) */
+    tt = st[15]; st[15] = s_box[st[11]]; st[11] = s_box[st[ 7]];
+    st[ 7] = s_box[st[ 3]]; st[ 3] = s_box[ tt ];
+}
+
+static void inv_shift_sub_rows( uint_8t st[N_BLOCK] )
+{   uint_8t tt;
+    /* In place: the reverse permutation of shift_sub_rows(), with every byte passed through inv_s_box. */
+    st[ 0] = inv_s_box[st[ 0]]; st[ 4] = inv_s_box[st[ 4]];
+    st[ 8] = inv_s_box[st[ 8]]; st[12] = inv_s_box[st[12]];
+    /* positions 1, 5, 9, 13: each takes the byte from 4 places lower (1 takes the old byte 13) */
+    tt = st[13]; st[13] = inv_s_box[st[9]]; st[ 9] = inv_s_box[st[5]];
+    st[ 5] = inv_s_box[st[1]]; st[ 1] = inv_s_box[ tt ];
+    /* positions 2, 6, 10, 14: swap 2 with 10 and 6 with 14 (this swap is its own inverse) */
+    tt = st[2]; st[ 2] = inv_s_box[st[10]]; st[10] = inv_s_box[ tt ];
+    tt = st[6]; st[ 6] = inv_s_box[st[14]]; st[14] = inv_s_box[ tt ];
+    /* positions 3, 7, 11, 15: each takes the byte from 4 places higher (15 takes the old byte 3) */
+    tt = st[3]; st[ 3] = inv_s_box[st[ 7]]; st[ 7] = inv_s_box[st[11]];
+    st[11] = inv_s_box[st[15]]; st[15] = inv_s_box[ tt ];
+}
+
+#if defined( VERSION_1 )     /* in-place form: dt is both input and output */
+  static void mix_sub_columns( uint_8t dt[N_BLOCK] )
+  { uint_8t st[N_BLOCK];
+    block16_copy(st, dt);    /* snapshot the input so dt can be overwritten below */
+#else                        /* two-buffer form: reads st, writes dt */
+  static void mix_sub_columns( uint_8t dt[N_BLOCK], uint_8t st[N_BLOCK] )
+  {
+#endif   /* Either form: every output byte is the XOR of four table lookups on input bytes. */
+    dt[ 0] = gfm2_s_box[st[0]] ^ gfm3_s_box[st[5]] ^ s_box[st[10]] ^ s_box[st[15]];   /* outputs 0-3 read inputs 0, 5, 10, 15: */
+    dt[ 1] = s_box[st[0]] ^ gfm2_s_box[st[5]] ^ gfm3_s_box[st[10]] ^ s_box[st[15]];   /* the same position permutation as      */
+    dt[ 2] = s_box[st[0]] ^ s_box[st[5]] ^ gfm2_s_box[st[10]] ^ gfm3_s_box[st[15]];   /* shift_sub_rows() is folded into the   */
+    dt[ 3] = gfm3_s_box[st[0]] ^ s_box[st[5]] ^ s_box[st[10]] ^ gfm2_s_box[st[15]];   /* indexing                              */
+    /* gfm2_s_box / gfm3_s_box are defined outside this view; presumably s_box output times 2 and 3 in GF(2^8) -- confirm */
+    dt[ 4] = gfm2_s_box[st[4]] ^ gfm3_s_box[st[9]] ^ s_box[st[14]] ^ s_box[st[3]];
+    dt[ 5] = s_box[st[4]] ^ gfm2_s_box[st[9]] ^ gfm3_s_box[st[14]] ^ s_box[st[3]];
+    dt[ 6] = s_box[st[4]] ^ s_box[st[9]] ^ gfm2_s_box[st[14]] ^ gfm3_s_box[st[3]];
+    dt[ 7] = gfm3_s_box[st[4]] ^ s_box[st[9]] ^ s_box[st[14]] ^ gfm2_s_box[st[3]];
+    /* outputs 8-11 read inputs 8, 13, 2, 7 */
+    dt[ 8] = gfm2_s_box[st[8]] ^ gfm3_s_box[st[13]] ^ s_box[st[2]] ^ s_box[st[7]];
+    dt[ 9] = s_box[st[8]] ^ gfm2_s_box[st[13]] ^ gfm3_s_box[st[2]] ^ s_box[st[7]];
+    dt[10] = s_box[st[8]] ^ s_box[st[13]] ^ gfm2_s_box[st[2]] ^ gfm3_s_box[st[7]];
+    dt[11] = gfm3_s_box[st[8]] ^ s_box[st[13]] ^ s_box[st[2]] ^ gfm2_s_box[st[7]];
+    /* outputs 12-15 read inputs 12, 1, 6, 11 */
+    dt[12] = gfm2_s_box[st[12]] ^ gfm3_s_box[st[1]] ^ s_box[st[6]] ^ s_box[st[11]];
+    dt[13] = s_box[st[12]] ^ gfm2_s_box[st[1]] ^ gfm3_s_box[st[6]] ^ s_box[st[11]];
+    dt[14] = s_box[st[12]] ^ s_box[st[1]] ^ gfm2_s_box[st[6]] ^ gfm3_s_box[st[11]];
+    dt[15] = gfm3_s_box[st[12]] ^ s_box[st[1]] ^ s_box[st[6]] ^ gfm2_s_box[st[11]];
+  }
+
+#if defined( VERSION_1 )     /* in-place form: dt is both input and output */
+  static void inv_mix_sub_columns( uint_8t dt[N_BLOCK] )
+  { uint_8t st[N_BLOCK];
+    block16_copy(st, dt);    /* snapshot the input so dt can be overwritten below */
+#else                        /* two-buffer form: reads st, writes dt */
+  static void inv_mix_sub_columns( uint_8t dt[N_BLOCK], uint_8t st[N_BLOCK] )
+  {
+#endif   /* Either form: XOR four gfmul_* lookups per output byte, then pass the result through inv_s_box. */
+    dt[ 0] = inv_s_box[gfmul_e[st[ 0]] ^ gfmul_b[st[ 1]] ^ gfmul_d[st[ 2]] ^ gfmul_9[st[ 3]]];   /* inputs 0-3 feed outputs 0, 5, 10, 15 */
+    dt[ 5] = inv_s_box[gfmul_9[st[ 0]] ^ gfmul_e[st[ 1]] ^ gfmul_b[st[ 2]] ^ gfmul_d[st[ 3]]];
+    dt[10] = inv_s_box[gfmul_d[st[ 0]] ^ gfmul_9[st[ 1]] ^ gfmul_e[st[ 2]] ^ gfmul_b[st[ 3]]];
+    dt[15] = inv_s_box[gfmul_b[st[ 0]] ^ gfmul_d[st[ 1]] ^ gfmul_9[st[ 2]] ^ gfmul_e[st[ 3]]];
+    /* inputs 4-7 feed outputs 4, 9, 14, 3; the gfmul_* tables are defined outside this view */
+    dt[ 4] = inv_s_box[gfmul_e[st[ 4]] ^ gfmul_b[st[ 5]] ^ gfmul_d[st[ 6]] ^ gfmul_9[st[ 7]]];
+    dt[ 9] = inv_s_box[gfmul_9[st[ 4]] ^ gfmul_e[st[ 5]] ^ gfmul_b[st[ 6]] ^ gfmul_d[st[ 7]]];
+    dt[14] = inv_s_box[gfmul_d[st[ 4]] ^ gfmul_9[st[ 5]] ^ gfmul_e[st[ 6]] ^ gfmul_b[st[ 7]]];
+    dt[ 3] = inv_s_box[gfmul_b[st[ 4]] ^ gfmul_d[st[ 5]] ^ gfmul_9[st[ 6]] ^ gfmul_e[st[ 7]]];
+    /* inputs 8-11 feed outputs 8, 13, 2, 7 */
+    dt[ 8] = inv_s_box[gfmul_e[st[ 8]] ^ gfmul_b[st[ 9]] ^ gfmul_d[st[10]] ^ gfmul_9[st[11]]];
+    dt[13] = inv_s_box[gfmul_9[st[ 8]] ^ gfmul_e[st[ 9]] ^ gfmul_b[st[10]] ^ gfmul_d[st[11]]];
+    dt[ 2] = inv_s_box[gfmul_d[st[ 8]] ^ gfmul_9[st[ 9]] ^ gfmul_e[st[10]] ^ gfmul_b[st[11]]];
+    dt[ 7] = inv_s_box[gfmul_b[st[ 8]] ^ gfmul_d[st[ 9]] ^ gfmul_9[st[10]] ^ gfmul_e[st[11]]];
+    /* inputs 12-15 feed outputs 12, 1, 6, 11 */
+    dt[12] = inv_s_box[gfmul_e[st[12]] ^ gfmul_b[st[13]] ^ gfmul_d[st[14]] ^ gfmul_9[st[15]]];
+    dt[ 1] = inv_s_box[gfmul_9[st[12]] ^ gfmul_e[st[13]] ^ gfmul_b[st[14]] ^ gfmul_d[st[15]]];
+    dt[ 6] = inv_s_box[gfmul_d[st[12]] ^ gfmul_9[st[13]] ^ gfmul_e[st[14]] ^ gfmul_b[st[15]]];
+    dt[11] = inv_s_box[gfmul_b[st[12]] ^ gfmul_d[st[13]] ^ gfmul_9[st[14]] ^ gfmul_e[st[15]]];
+  }
+
+#if defined( AES_ENC_PREKEYED ) || defined( AES_DEC_PREKEYED )
+
+/*  Set the cipher key for the pre-keyed version */
+
+return_type aes_set_key( const unsigned char key[], length_type keylen, aes_context ctx[1] )
+{   /* Expand key[] into ctx->ksch and set ctx->rnd; returns 0, or (return_type)-1 for an unsupported keylen. */
+    uint_8t cc, rc, hi;
+    /* keylen may be given in bytes (16/24/32) or in bits (128/192/256); normalise it to bytes */
+    switch( keylen )
+    {
+    case 16:
+    case 128:
+        keylen = 16;
+        break;
+    case 24:
+    case 192:
+        keylen = 24;
+        break;
+    case 32:
+    case 256:                /* can never match if length_type is an 8-bit type, which cannot hold 256 */
+        keylen = 32;
+        break;
+    default:
+        ctx->rnd = 0;        /* aes_encrypt()/aes_decrypt() refuse a context whose rnd is 0 */
+        return (return_type) -1;
+    }
+    block_copy(ctx->ksch, key, keylen);     /* the schedule begins with the raw key bytes */
+    hi = (keylen + 28) << 2;                /* schedule length in bytes: 176, 208 or 240 */
+    ctx->rnd = (hi >> 4) - 1;               /* 10, 12 or 14 */
+    for( cc = keylen, rc = 1; cc < hi; cc += 4 )    /* produce four schedule bytes per pass */
+    {   uint_8t tt, t0, t1, t2, t3;
+        /* t0..t3 start as the four schedule bytes immediately before position cc */
+        t0 = ctx->ksch[cc - 4];
+        t1 = ctx->ksch[cc - 3];
+        t2 = ctx->ksch[cc - 2];
+        t3 = ctx->ksch[cc - 1];
+        if( cc % keylen == 0 )              /* first four bytes of each keylen-sized group */
+        {
+            tt = t0;
+            t0 = s_box[t1] ^ rc;            /* rotate by one byte, substitute, mix in the round constant */
+            t1 = s_box[t2];
+            t2 = s_box[t3];
+            t3 = s_box[tt];
+            rc = f2(rc);                    /* next round constant; f2 is defined outside this view */
+        }
+        else if( keylen > 24 && cc % keylen == 16 )     /* 32-byte keys only: substitute without rotating */
+        {
+            t0 = s_box[t0];
+            t1 = s_box[t1];
+            t2 = s_box[t2];
+            t3 = s_box[t3];
+        }
+        tt = cc - keylen;                   /* offset of the bytes one key length earlier */
+        ctx->ksch[cc + 0] = ctx->ksch[tt + 0] ^ t0;
+        ctx->ksch[cc + 1] = ctx->ksch[tt + 1] ^ t1;
+        ctx->ksch[cc + 2] = ctx->ksch[tt + 2] ^ t2;
+        ctx->ksch[cc + 3] = ctx->ksch[tt + 3] ^ t3;
+    }
+    return 0;
+}
+
+#endif
+
+#if defined( AES_ENC_PREKEYED )
+
+/*  Encrypt a single block of 16 bytes */
+
+return_type aes_encrypt( const unsigned char in[N_BLOCK], unsigned char  out[N_BLOCK], const aes_context ctx[1] )
+{   /* Encrypt the block in[] into out[] with the schedule in ctx; returns 0, or (return_type)-1 if ctx->rnd is 0. */
+    if( ctx->rnd )
+    {
+        uint_8t s1[N_BLOCK], r;
+        copy_and_key( s1, in, ctx->ksch );      /* state = input XOR the first 16 schedule bytes */
+        /* rounds 1 .. rnd-1; the loop body is whichever braced block the preprocessor keeps */
+        for( r = 1 ; r < ctx->rnd ; ++r )
+#if defined( VERSION_1 )
+        {
+            mix_sub_columns( s1 );
+            add_round_key( s1, ctx->ksch + r * N_BLOCK);
+        }
+#else
+        {   uint_8t s2[N_BLOCK];
+            mix_sub_columns( s2, s1 );
+            copy_and_key( s1, s2, ctx->ksch + r * N_BLOCK);
+        }
+#endif
+        shift_sub_rows( s1 );                   /* last round: substitute and permute only */
+        copy_and_key( out, s1, ctx->ksch + r * N_BLOCK );   /* r == ctx->rnd here, selecting the final round key */
+    }
+    else
+        return (return_type) -1;
+    return 0;
+}
+
+#endif
+
+#if defined( AES_DEC_PREKEYED )
+
+/*  Decrypt a single block of 16 bytes */
+
+return_type aes_decrypt( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK], const aes_context ctx[1] )
+{   /* Decrypt the block in[] into out[] with the schedule in ctx; returns 0, or (return_type)-1 if ctx->rnd is 0. */
+    if( ctx->rnd )
+    {
+        uint_8t s1[N_BLOCK], r;
+        copy_and_key( s1, in, ctx->ksch + ctx->rnd * N_BLOCK );     /* start by removing the final round key */
+        inv_shift_sub_rows( s1 );
+        /* r counts down from rnd-1 to 1; the loop ends when --r reaches 0 */
+        for( r = ctx->rnd ; --r ; )
+#if defined( VERSION_1 )
+        {
+            add_round_key( s1, ctx->ksch + r * N_BLOCK );
+            inv_mix_sub_columns( s1 );
+        }
+#else
+        {   uint_8t s2[N_BLOCK];
+            copy_and_key( s2, s1, ctx->ksch + r * N_BLOCK );
+            inv_mix_sub_columns( s1, s2 );
+        }
+#endif
+        copy_and_key( out, s1, ctx->ksch );     /* finish with the first 16 schedule bytes */
+    }
+    else
+        return (return_type) -1;
+    return 0;
+}
+
+#endif
+
+#if defined( AES_ENC_128_OTFK )
+
+/*  The 'on the fly' encryption key update for 128 bit keys */
+
+static void update_encrypt_key_128( uint_8t k[N_BLOCK], uint_8t *rc )
+{   uint_8t cc;
+    /* Advance the 16-byte round key k in place by one step and move the round constant *rc on. */
+    k[0] ^= s_box[k[13]] ^ *rc;     /* bytes 0-3 ^= s_box of bytes 12-15 rotated by one byte, plus *rc on byte 0 */
+    k[1] ^= s_box[k[14]];
+    k[2] ^= s_box[k[15]];
+    k[3] ^= s_box[k[12]];
+    *rc = f2( *rc );                /* next round constant; f2 is defined outside this view */
+
+    for(cc = 4; cc < 16; cc += 4 )  /* every later 4-byte group ^= the (already updated) group before it */
+    {
+        k[cc + 0] ^= k[cc - 4];
+        k[cc + 1] ^= k[cc - 3];
+        k[cc + 2] ^= k[cc - 2];
+        k[cc + 3] ^= k[cc - 1];
+    }
+}
+
+/*  Encrypt a single block of 16 bytes with 'on the fly' 128 bit keying */
+
+void aes_encrypt_128( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK],
+                     const unsigned char key[N_BLOCK], unsigned char o_key[N_BLOCK] )
+{   uint_8t s1[N_BLOCK], r, rc = 1;
+    /* Encrypt in[] into out[], deriving each round key in o_key as it goes; o_key is left holding the last one. */
+    if(o_key != key)                    /* key and o_key may be the same array; copy only when they differ */
+        block16_copy( o_key, key );
+    copy_and_key( s1, in, o_key );      /* state = input XOR the original key */
+
+    for( r = 1 ; r < 10 ; ++r )         /* rounds 1..9; the loop body is whichever braced block the preprocessor keeps */
+#if defined( VERSION_1 )
+    {
+        mix_sub_columns( s1 );
+        update_encrypt_key_128( o_key, &rc );
+        add_round_key( s1, o_key );
+    }
+#else
+    {   uint_8t s2[N_BLOCK];
+        mix_sub_columns( s2, s1 );
+        update_encrypt_key_128( o_key, &rc );
+        copy_and_key( s1, s2, o_key );
+    }
+#endif
+
+    shift_sub_rows( s1 );               /* last round: substitute and permute only */
+    update_encrypt_key_128( o_key, &rc );   /* tenth and final key update */
+    copy_and_key( out, s1, o_key );
+}
+
+#endif
+
+#if defined( AES_DEC_128_OTFK )
+
+/*  The 'on the fly' decryption key update for 128 bit keys */
+
+static void update_decrypt_key_128( uint_8t k[N_BLOCK], uint_8t *rc )
+{   uint_8t cc;
+    /* Step the 16-byte round key k back in place: the steps of update_encrypt_key_128() in reverse order. */
+    for( cc = 12; cc > 0; cc -= 4 )     /* highest group first, so each XOR uses a group not yet restored */
+    {
+        k[cc + 0] ^= k[cc - 4];
+        k[cc + 1] ^= k[cc - 3];
+        k[cc + 2] ^= k[cc - 2];
+        k[cc + 3] ^= k[cc - 1];
+    }
+    *rc = d2(*rc);                      /* previous round constant; d2 is defined outside this view */
+    k[0] ^= s_box[k[13]] ^ *rc;         /* bytes 12-15 are restored by now, so this cancels the forward XOR */
+    k[1] ^= s_box[k[14]];
+    k[2] ^= s_box[k[15]];
+    k[3] ^= s_box[k[12]];
+}
+
+/*  Decrypt a single block of 16 bytes with 'on the fly' 128 bit keying */
+
+void aes_decrypt_128( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK],
+                      const unsigned char key[N_BLOCK], unsigned char o_key[N_BLOCK] )
+{   /* Decrypt in[] into out[], stepping the round key in o_key backwards as it goes. */
+    uint_8t s1[N_BLOCK], r, rc = 0x6c;  /* rc: update_decrypt_key_128() applies d2() to it before each use */
+    if(o_key != key)                    /* key and o_key may be the same array; copy only when they differ */
+        block16_copy( o_key, key );
+
+    copy_and_key( s1, in, o_key );      /* key[] is used as supplied for the first XOR */
+    inv_shift_sub_rows( s1 );
+
+    for( r = 10 ; --r ; )               /* nine passes: r = 9 down to 1 */
+#if defined( VERSION_1 )
+    {
+        update_decrypt_key_128( o_key, &rc );
+        add_round_key( s1, o_key );
+        inv_mix_sub_columns( s1 );
+    }
+#else
+    {   uint_8t s2[N_BLOCK];
+        update_decrypt_key_128( o_key, &rc );
+        copy_and_key( s2, s1, o_key );
+        inv_mix_sub_columns( s1, s2 );
+    }
+#endif
+    update_decrypt_key_128( o_key, &rc );   /* tenth and final step back */
+    copy_and_key( out, s1, o_key );
+}
+
+#endif
+
+#if defined( AES_ENC_256_OTFK )
+
+/*  The 'on the fly' encryption key update for 256 bit keys */
+
+static void update_encrypt_key_256( uint_8t k[2 * N_BLOCK], uint_8t *rc )
+{   uint_8t cc;
+    /* Advance the 32-byte key state k in place by one step (both 16-byte halves) and move *rc on. */
+    k[0] ^= s_box[k[29]] ^ *rc;     /* bytes 0-3 ^= s_box of bytes 28-31 rotated by one byte, plus *rc on byte 0 */
+    k[1] ^= s_box[k[30]];
+    k[2] ^= s_box[k[31]];
+    k[3] ^= s_box[k[28]];
+    *rc = f2( *rc );                /* next round constant; f2 is defined outside this view */
+
+    for(cc = 4; cc < 16; cc += 4)   /* rest of the first half: each group ^= the group before it */
+    {
+        k[cc + 0] ^= k[cc - 4];
+        k[cc + 1] ^= k[cc - 3];
+        k[cc + 2] ^= k[cc - 2];
+        k[cc + 3] ^= k[cc - 1];
+    }
+    /* second half opens with a substitution of bytes 12-15, with no rotation and no round constant */
+    k[16] ^= s_box[k[12]];
+    k[17] ^= s_box[k[13]];
+    k[18] ^= s_box[k[14]];
+    k[19] ^= s_box[k[15]];
+
+    for( cc = 20; cc < 32; cc += 4 )    /* rest of the second half: each group ^= the group before it */
+    {
+        k[cc + 0] ^= k[cc - 4];
+        k[cc + 1] ^= k[cc - 3];
+        k[cc + 2] ^= k[cc - 2];
+        k[cc + 3] ^= k[cc - 1];
+    }
+}
+
+/*  Encrypt a single block of 16 bytes with 'on the fly' 256 bit keying */
+
+void aes_encrypt_256( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK],
+                      const unsigned char key[2 * N_BLOCK], unsigned char o_key[2 * N_BLOCK] )
+{   /* Encrypt in[] into out[], deriving round keys in the 32-byte o_key as it goes; o_key is left in its final state. */
+    uint_8t s1[N_BLOCK], r, rc = 1;
+    if(o_key != key)                    /* key and o_key may be the same array; copy only when they differ */
+    {
+        block16_copy( o_key, key );
+        block16_copy( o_key + 16, key + 16 );
+    }
+    copy_and_key( s1, in, o_key );      /* state = input XOR the first half of the key */
+
+    for( r = 1 ; r < 14 ; ++r )         /* rounds 1..13; the loop body is whichever braced block the preprocessor keeps */
+#if defined( VERSION_1 )
+    {
+        mix_sub_columns(s1);
+        if( r & 1 )                     /* odd rounds use the second half of the current key state */
+            add_round_key( s1, o_key + 16 );
+        else                            /* even rounds first advance the key state, then use its first half */
+        {
+            update_encrypt_key_256( o_key, &rc );
+            add_round_key( s1, o_key );
+        }
+    }
+#else
+    {   uint_8t s2[N_BLOCK];
+        mix_sub_columns( s2, s1 );
+        if( r & 1 )                     /* odd rounds use the second half of the current key state */
+            copy_and_key( s1, s2, o_key + 16 );
+        else                            /* even rounds first advance the key state, then use its first half */
+        {
+            update_encrypt_key_256( o_key, &rc );
+            copy_and_key( s1, s2, o_key );
+        }
+    }
+#endif
+
+    shift_sub_rows( s1 );               /* last round: substitute and permute only */
+    update_encrypt_key_256( o_key, &rc );   /* seventh and final key update */
+    copy_and_key( out, s1, o_key );
+}
+
+#endif
+
+#if defined( AES_DEC_256_OTFK )
+
+/*  The 'on the fly' decryption key update for 256 bit keys */
+
+static void update_decrypt_key_256( uint_8t k[2 * N_BLOCK], uint_8t *rc )
+{   uint_8t cc;
+    /* Step the 32-byte key state k back in place: the steps of update_encrypt_key_256() in reverse order. */
+    for(cc = 28; cc > 16; cc -= 4)  /* second half, highest group first */
+    {
+        k[cc + 0] ^= k[cc - 4];
+        k[cc + 1] ^= k[cc - 3];
+        k[cc + 2] ^= k[cc - 2];
+        k[cc + 3] ^= k[cc - 1];
+    }
+    /* cancel the substitution that opened the second half */
+    k[16] ^= s_box[k[12]];
+    k[17] ^= s_box[k[13]];
+    k[18] ^= s_box[k[14]];
+    k[19] ^= s_box[k[15]];
+
+    for(cc = 12; cc > 0; cc -= 4)   /* first half, highest group first */
+    {
+        k[cc + 0] ^= k[cc - 4];
+        k[cc + 1] ^= k[cc - 3];
+        k[cc + 2] ^= k[cc - 2];
+        k[cc + 3] ^= k[cc - 1];
+    }
+
+    *rc = d2(*rc);                  /* previous round constant; d2 is defined outside this view */
+    k[0] ^= s_box[k[29]] ^ *rc;     /* bytes 28-31 are restored by now, so this cancels the forward XOR */
+    k[1] ^= s_box[k[30]];
+    k[2] ^= s_box[k[31]];
+    k[3] ^= s_box[k[28]];
+}
+
+/*  Decrypt a single block of 16 bytes with 'on the fly'
+    256 bit keying
+*/
+void aes_decrypt_256( const unsigned char in[N_BLOCK], unsigned char out[N_BLOCK],
+                      const unsigned char key[2 * N_BLOCK], unsigned char o_key[2 * N_BLOCK] )
+{   /* Decrypt in[] into out[], stepping the 32-byte key state in o_key backwards as it goes. */
+    uint_8t s1[N_BLOCK], r, rc = 0x80;  /* rc: update_decrypt_key_256() applies d2() to it before each use */
+
+    if(o_key != key)                    /* key and o_key may be the same array; copy only when they differ */
+    {
+        block16_copy( o_key, key );
+        block16_copy( o_key + 16, key + 16 );
+    }
+
+    copy_and_key( s1, in, o_key );      /* the first half of key[] is used as supplied for the first XOR */
+    inv_shift_sub_rows( s1 );
+
+    for( r = 14 ; --r ; )               /* thirteen passes: r = 13 down to 1 */
+#if defined( VERSION_1 )
+    {
+        if( ( r & 1 ) )                 /* odd rounds step the key state back, then use its second half */
+        {
+            update_decrypt_key_256( o_key, &rc );
+            add_round_key( s1, o_key + 16 );
+        }
+        else                            /* even rounds use the first half of the current key state */
+            add_round_key( s1, o_key );
+        inv_mix_sub_columns( s1 );
+    }
+#else
+    {   uint_8t s2[N_BLOCK];
+        if( ( r & 1 ) )                 /* odd rounds step the key state back, then use its second half */
+        {
+            update_decrypt_key_256( o_key, &rc );
+            copy_and_key( s2, s1, o_key + 16 );
+        }
+        else                            /* even rounds use the first half of the current key state */
+            copy_and_key( s2, s1, o_key );
+        inv_mix_sub_columns( s1, s2 );
+    }
+#endif
+    copy_and_key( out, s1, o_key );     /* finish with the first half of the fully stepped-back key */
+}
+
+#endif
diff --git a/src/Crypto/AesSmall.h b/src/Crypto/AesSmall.h
index 516c6964..ebeb24ef 100644
--- a/src/Crypto/AesSmall.h
+++ b/src/Crypto/AesSmall.h
@@ -1,169 +1,169 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software in both source and binary
- form is allowed (with or without changes) provided that:
-
-   1. distributions of this source code include the above copyright
-      notice, this list of conditions and the following disclaimer;
-
-   2. distributions in binary form include the above copyright
-      notice, this list of conditions and the following disclaimer
-      in the documentation and/or other associated materials;
-
-   3. the copyright holder's name is not used to endorse products
-      built using this software without specific written permission.
-
- ALTERNATIVELY, provided that this notice is retained in full, this product
- may be distributed under the terms of the GNU General Public License (GPL),
- in which case the provisions of the GPL apply INSTEAD OF those given above.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue 09/09/2006
-
- This is an AES implementation that uses only 8-bit byte operations on the
- cipher state.
- */
-
-#ifndef AES_H
-#define AES_H
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-/*  This provides speed optimisation opportunities if 32-bit word
-    operations are available
-*/
-#if 1
-#  define HAVE_UINT_32T
-#endif
-
-#if 1
-#  define AES_ENC_PREKEYED  /* AES encryption with a precomputed key schedule  */
-#endif
-#if 1
-#  define AES_DEC_PREKEYED  /* AES decryption with a precomputed key schedule  */
-#endif
-#if 0
-#  define AES_ENC_128_OTFK  /* AES encryption with 'on the fly' 128 bit keying */
-#endif
-#if 0
-#  define AES_DEC_128_OTFK  /* AES decryption with 'on the fly' 128 bit keying */
-#endif
-#if 0
-#  define AES_ENC_256_OTFK  /* AES encryption with 'on the fly' 256 bit keying */
-#endif
-#if 0
-#  define AES_DEC_256_OTFK  /* AES decryption with 'on the fly' 256 bit keying */
-#endif
-
-#define N_ROW                   4
-#define N_COL                   4
-#define N_BLOCK   (N_ROW * N_COL)
-#define N_MAX_ROUNDS           14
-
-typedef unsigned char uint_8t;
-
-typedef uint_8t return_type;
-typedef uint_8t length_type;
-typedef uint_8t uint_type;
-
-typedef unsigned char uint_8t;
-
-typedef struct
-{   uint_8t ksch[(N_MAX_ROUNDS + 1) * N_BLOCK];
-    uint_8t rnd;
-} aes_context;
-
-/*  The following calls are for a precomputed key schedule
-
-    NOTE: If the length_type used for the key length is an
-    unsigned 8-bit character, a key length of 256 bits must
-    be entered as a length in bytes (valid inputs are hence
-    128, 192, 16, 24 and 32).
-*/
-
-#if defined( AES_ENC_PREKEYED ) || defined( AES_DEC_PREKEYED )
-
-return_type aes_set_key( const unsigned char key[],
-                         length_type keylen,
-                         aes_context ctx[1] );
-#endif
-
-#if defined( AES_ENC_PREKEYED )
-
-return_type aes_encrypt( const unsigned char in[N_BLOCK],
-                         unsigned char out[N_BLOCK],
-                         const aes_context ctx[1] );
-#endif
-
-#if defined( AES_DEC_PREKEYED )
-
-return_type aes_decrypt( const unsigned char in[N_BLOCK],
-                         unsigned char out[N_BLOCK],
-                         const aes_context ctx[1] );
-#endif
-
-/*  The following calls are for 'on the fly' keying.  In this case the
-    encryption and decryption keys are different.
-
-    The encryption subroutines take a key in an array of bytes in
-    key[L] where L is 16, 24 or 32 bytes for key lengths of 128,
-    192, and 256 bits respectively.  They then encrypts the input
-    data, in[] with this key and put the reult in the output array
-    out[].  In addition, the second key array, o_key[L], is used
-    to output the key that is needed by the decryption subroutine
-    to reverse the encryption operation.  The two key arrays can
-    be the same array but in this case the original key will be
-    overwritten.
-
-    In the same way, the decryption subroutines output keys that
-    can be used to reverse their effect when used for encryption.
-
-    Only 128 and 256 bit keys are supported in these 'on the fly'
-    modes.
-*/
-
-#if defined( AES_ENC_128_OTFK )
-void aes_encrypt_128( const unsigned char in[N_BLOCK],
-                      unsigned char out[N_BLOCK],
-                      const unsigned char key[N_BLOCK],
-                      uint_8t o_key[N_BLOCK] );
-#endif
-
-#if defined( AES_DEC_128_OTFK )
-void aes_decrypt_128( const unsigned char in[N_BLOCK],
-                      unsigned char out[N_BLOCK],
-                      const unsigned char key[N_BLOCK],
-                      unsigned char o_key[N_BLOCK] );
-#endif
-
-#if defined( AES_ENC_256_OTFK )
-void aes_encrypt_256( const unsigned char in[N_BLOCK],
-                      unsigned char out[N_BLOCK],
-                      const unsigned char key[2 * N_BLOCK],
-                      unsigned char o_key[2 * N_BLOCK] );
-#endif
-
-#if defined( AES_DEC_256_OTFK )
-void aes_decrypt_256( const unsigned char in[N_BLOCK],
-                      unsigned char out[N_BLOCK],
-                      const unsigned char key[2 * N_BLOCK],
-                      unsigned char o_key[2 * N_BLOCK] );
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 09/09/2006
+
+ This is an AES implementation that uses only 8-bit byte operations on the
+ cipher state.
+ */
+
+#ifndef AES_H
+#define AES_H
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/*  This provides speed optimisation opportunities if 32-bit word
+    operations are available
+*/
+#if 1
+#  define HAVE_UINT_32T
+#endif
+
+#if 1
+#  define AES_ENC_PREKEYED  /* AES encryption with a precomputed key schedule  */
+#endif
+#if 1
+#  define AES_DEC_PREKEYED  /* AES decryption with a precomputed key schedule  */
+#endif
+#if 0
+#  define AES_ENC_128_OTFK  /* AES encryption with 'on the fly' 128 bit keying */
+#endif
+#if 0
+#  define AES_DEC_128_OTFK  /* AES decryption with 'on the fly' 128 bit keying */
+#endif
+#if 0
+#  define AES_ENC_256_OTFK  /* AES encryption with 'on the fly' 256 bit keying */
+#endif
+#if 0
+#  define AES_DEC_256_OTFK  /* AES decryption with 'on the fly' 256 bit keying */
+#endif
+
+#define N_ROW                   4
+#define N_COL                   4
+#define N_BLOCK   (N_ROW * N_COL)
+#define N_MAX_ROUNDS           14
+
+typedef unsigned char uint_8t;
+
+typedef uint_8t return_type;
+typedef uint_8t length_type;
+typedef uint_8t uint_type;
+
+typedef unsigned char uint_8t;
+
+typedef struct      /* state shared by the pre-keyed calls: filled by aes_set_key(), read by aes_encrypt()/aes_decrypt() */
+{   uint_8t ksch[(N_MAX_ROUNDS + 1) * N_BLOCK];    /* key schedule buffer: (14 + 1) * 16 = 240 bytes */
+    uint_8t rnd;                                    /* round count for the loaded key */
+} aes_context;
+
+/*  The following calls are for a precomputed key schedule
+
+    NOTE: If the length_type used for the key length is an
+    unsigned 8-bit character, a key length of 256 bits must
+    be entered as a length in bytes (valid inputs are hence
+    128, 192, 16, 24 and 32).
+*/
+
+#if defined( AES_ENC_PREKEYED ) || defined( AES_DEC_PREKEYED )
+
+return_type aes_set_key( const unsigned char key[],
+                         length_type keylen,
+                         aes_context ctx[1] );
+#endif
+
+#if defined( AES_ENC_PREKEYED )
+
+return_type aes_encrypt( const unsigned char in[N_BLOCK],
+                         unsigned char out[N_BLOCK],
+                         const aes_context ctx[1] );
+#endif
+
+#if defined( AES_DEC_PREKEYED )
+
+return_type aes_decrypt( const unsigned char in[N_BLOCK],
+                         unsigned char out[N_BLOCK],
+                         const aes_context ctx[1] );
+#endif
+
+/*  The following calls are for 'on the fly' keying.  In this case the
+    encryption and decryption keys are different.
+
+    The encryption subroutines take a key in an array of bytes in
+    key[L] where L is 16, 24 or 32 bytes for key lengths of 128,
+    192, and 256 bits respectively.  They then encrypt the input
+    data, in[] with this key and put the result in the output array
+    out[].  In addition, the second key array, o_key[L], is used
+    to output the key that is needed by the decryption subroutine
+    to reverse the encryption operation.  The two key arrays can
+    be the same array but in this case the original key will be
+    overwritten.
+
+    In the same way, the decryption subroutines output keys that
+    can be used to reverse their effect when used for encryption.
+
+    Only 128 and 256 bit keys are supported in these 'on the fly'
+    modes.
+*/
+
+#if defined( AES_ENC_128_OTFK )
+void aes_encrypt_128( const unsigned char in[N_BLOCK],
+                      unsigned char out[N_BLOCK],
+                      const unsigned char key[N_BLOCK],
+                      uint_8t o_key[N_BLOCK] );
+#endif
+
+#if defined( AES_DEC_128_OTFK )
+void aes_decrypt_128( const unsigned char in[N_BLOCK],
+                      unsigned char out[N_BLOCK],
+                      const unsigned char key[N_BLOCK],
+                      unsigned char o_key[N_BLOCK] );
+#endif
+
+#if defined( AES_ENC_256_OTFK )
+void aes_encrypt_256( const unsigned char in[N_BLOCK],
+                      unsigned char out[N_BLOCK],
+                      const unsigned char key[2 * N_BLOCK],
+                      unsigned char o_key[2 * N_BLOCK] );
+#endif
+
+#if defined( AES_DEC_256_OTFK )
+void aes_decrypt_256( const unsigned char in[N_BLOCK],
+                      unsigned char out[N_BLOCK],
+                      const unsigned char key[2 * N_BLOCK],
+                      unsigned char o_key[2 * N_BLOCK] );
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/src/Crypto/AesSmall_x86.asm b/src/Crypto/AesSmall_x86.asm
index fe7dc47b..de32fc66 100644
--- a/src/Crypto/AesSmall_x86.asm
+++ b/src/Crypto/AesSmall_x86.asm
@@ -1,1444 +1,1444 @@
-
-; ---------------------------------------------------------------------------
-; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-; 
-; LICENSE TERMS
-; 
-; The free distribution and use of this software is allowed (with or without
-; changes) provided that:
-; 
-;  1. source code distributions include the above copyright notice, this
-;     list of conditions and the following disclaimer;
-; 
-;  2. binary distributions include the above copyright notice, this list
-;     of conditions and the following disclaimer in their documentation;
-; 
-;  3. the name of the copyright holder is not used to endorse products
-;     built using this software without specific written permission.
-; 
-; DISCLAIMER
-; 
-; This software is provided 'as is' with no explicit or implied warranties
-; in respect of its properties, including, but not limited to, correctness
-; and/or fitness for purpose.
-; ---------------------------------------------------------------------------
-; Issue 20/12/2007
-;
-; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h
-; and the same define to be set here as well. If AES_V2C is set this file
-; requires the C files aeskey.c and aestab.c for support.
-
-; An AES implementation for x86 processors using the YASM (or NASM) assembler.
-; This is a full assembler implementation covering encryption, decryption and
-; key scheduling. It uses 2k bytes of tables but its encryption and decryption
-; performance is very close to that obtained using large tables.  Key schedule
-; expansion is slower for both encryption and decryption but this is likely to
-; be offset by the much smaller load that this version places on the processor
-; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of
-; the design of the AES round function used here.
-;
-; This code provides the standard AES block size (128 bits, 16 bytes) and the
-; three standard AES key sizes (128, 192 and 256 bits). It has the same call
-; interface as my C implementation. The ebx, esi, edi and ebp registers are
-; preserved across calls but eax, ecx and edx and the artihmetic status flags
-; are not.  Although this is a full assembler implementation, it can be used
-; in conjunction with my C code which provides faster key scheduling using
-; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C
-; defined.  It is also important that the defines below match those used in the
-; C code.  This code uses the VC++ register saving conentions; if it is used
-; with another compiler, conventions for using and saving registers may need
-; to be checked (and calling conventions).  The YASM command line for the VC++
-; custom build step is:
-;
-;    yasm -Xvc -f win32 -D <Z> -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"
-;
-; For the cryptlib build this is (pcg):
-;
-;	yasm -Xvc -f win32 -D ASM_X86_V2C -o aescrypt2.obj aes_x86_v2.asm
-;
-; where <Z> is ASM_X86_V2 or ASM_X86_V2C.  The calling intefaces are:
-;
-;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
-;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
-;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
-;                                            const aes_encrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
-;                                            const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_encrypt_key(const unsigned char key[],
-;                           unsigned int len, const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt_key(const unsigned char key[],
-;                           unsigned int len, const aes_decrypt_ctx cx[1]);
-;
-; where <NNN> is 128, 102 or 256.  In the last two calls the length can be in
-; either bits or bytes.
-
-; The DLL interface must use the _stdcall convention in which the number
-; of bytes of parameter space is added after an @ to the sutine's name.
-; We must also remove our parameters from the stack before return (see
-; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
-
-;
-; Adapted for TrueCrypt:
-; - All tables generated at run-time
-; - Adapted for 16-bit environment
-;
-
-CPU 386
-USE16
-SEGMENT _TEXT PUBLIC CLASS=CODE USE16
-SEGMENT _DATA PUBLIC CLASS=DATA USE16
-
-GROUP DGROUP _TEXT _DATA
-
-extern _aes_dec_tab		; Aestab.c
-extern _aes_enc_tab
-
-; %define DLL_EXPORT
-
-; The size of the code can be reduced by using functions for the encryption
-; and decryption rounds in place of macro expansion
-
-%define REDUCE_CODE_SIZE
-
-; Comment in/out the following lines to obtain the desired subroutines. These
-; selections MUST match those in the C header file aes.h
-
-; %define AES_128                 ; define if AES with 128 bit keys is needed
-; %define AES_192                 ; define if AES with 192 bit keys is needed
-%define AES_256                 ; define if AES with 256 bit keys is needed
-; %define AES_VAR                 ; define if a variable key size is needed
-%define ENCRYPTION              ; define if encryption is needed
-%define DECRYPTION              ; define if decryption is needed
-; %define AES_REV_DKS             ; define if key decryption schedule is reversed
-
-%ifndef ASM_X86_V2C
-%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed
-%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed
-%endif
-
-; The encryption key schedule has the following in memory layout where N is the
-; number of rounds (10, 12 or 14):
-;
-; lo: | input key (round 0)  |  ; each round is four 32-bit words
-;     | encryption round 1   |
-;     | encryption round 2   |
-;     ....
-;     | encryption round N-1 |
-; hi: | encryption round N   |
-;
-; The decryption key schedule is normally set up so that it has the same
-; layout as above by actually reversing the order of the encryption key
-; schedule in memory (this happens when AES_REV_DKS is set):
-;
-; lo: | decryption round 0   | =              | encryption round N   |
-;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
-;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
-;     ....                       ....
-;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
-; hi: | decryption round N   | =              | input key (round 0)  |
-;
-; with rounds except the first and last modified using inv_mix_column()
-; But if AES_REV_DKS is NOT set the order of keys is left as it is for
-; encryption so that it has to be accessed in reverse when used for
-; decryption (although the inverse mix column modifications are done)
-;
-; lo: | decryption round 0   | =              | input key (round 0)  |
-;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
-;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
-;     ....                       ....
-;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
-; hi: | decryption round N   | =              | encryption round N   |
-;
-; This layout is faster when the assembler key scheduling provided here
-; is used.
-;
-; End of user defines
-
-%ifdef AES_VAR
-%ifndef AES_128
-%define AES_128
-%endif
-%ifndef AES_192
-%define AES_192
-%endif
-%ifndef AES_256
-%define AES_256
-%endif
-%endif
-
-%ifdef AES_VAR
-%define KS_LENGTH       60
-%elifdef AES_256
-%define KS_LENGTH       60
-%elifdef AES_192
-%define KS_LENGTH       52
-%else
-%define KS_LENGTH       44
-%endif
-
-; These macros implement stack based local variables
-
-%macro  save 2
-    mov     [esp+4*%1],%2
-%endmacro
-
-%macro  restore 2
-    mov     %1,[esp+4*%2]
-%endmacro
-
-%ifdef  REDUCE_CODE_SIZE
-    %macro mf_call 1
-        call %1
-    %endmacro
-%else
-    %macro mf_call 1
-        %1
-    %endmacro
-%endif
-
-; the DLL has to implement the _stdcall calling interface on return
-; In this case we have to take our parameters (3 4-byte pointers)
-; off the stack
-
-%define parms 12
-
-%macro  do_name 1-2 parms
-%ifndef DLL_EXPORT
-    global  %1
-%1:
-%else
-    global  %1@%2
-    export  %1@%2
-%1@%2:
-%endif
-%endmacro
-
-%macro  do_call 1-2 parms
-%ifndef DLL_EXPORT
-    call    %1
-    add     esp,%2
-%else
-    call    %1@%2
-%endif
-%endmacro
-
-%macro  do_exit  0-1 parms
-%ifdef DLL_EXPORT
-    ret %1
-%else
-    ret
-%endif
-%endmacro
-
-; finite field multiplies by {02}, {04} and {08}
-
-%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
-%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
-%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
-
-; finite field multiplies required in table generation
-
-%define f3(x)   (f2(x) ^ x)
-%define f9(x)   (f8(x) ^ x)
-%define fb(x)   (f8(x) ^ f2(x) ^ x)
-%define fd(x)   (f8(x) ^ f4(x) ^ x)
-%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
-
-%define etab_0(x)   [_aes_enc_tab+4+8*x]
-%define etab_1(x)   [_aes_enc_tab+3+8*x]
-%define etab_2(x)   [_aes_enc_tab+2+8*x]
-%define etab_3(x)   [_aes_enc_tab+1+8*x]
-%define etab_b(x)   byte [_aes_enc_tab+1+8*x] ; used with movzx for 0x000000xx
-%define etab_w(x)   word [_aes_enc_tab+8*x]   ; used with movzx for 0x0000xx00
-
-%define btab_0(x)   [_aes_enc_tab+6+8*x]
-%define btab_1(x)   [_aes_enc_tab+5+8*x]
-%define btab_2(x)   [_aes_enc_tab+4+8*x]
-%define btab_3(x)   [_aes_enc_tab+3+8*x]
-
-; ROUND FUNCTION.  Build column[2] on ESI and column[3] on EDI that have the
-; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
-;
-; Input:
-;
-;   EAX     column[0]
-;   EBX     column[1]
-;   ECX     column[2]
-;   EDX     column[3]
-;   ESI     column key[round][2]
-;   EDI     column key[round][3]
-;   EBP     scratch
-;
-; Output:
-;
-;   EBP     column[0]   unkeyed
-;   EBX     column[1]   unkeyed
-;   ESI     column[2]   keyed
-;   EDI     column[3]   keyed
-;   EAX     scratch
-;   ECX     scratch
-;   EDX     scratch
-
-%macro rnd_fun 2
-
-    rol     ebx,16
-    %1      esi, cl, 0, ebp
-    %1      esi, dh, 1, ebp
-    %1      esi, bh, 3, ebp
-    %1      edi, dl, 0, ebp
-    %1      edi, ah, 1, ebp
-    %1      edi, bl, 2, ebp
-    %2      ebp, al, 0, ebp
-    shr     ebx,16
-    and     eax,0xffff0000
-    or      eax,ebx
-    shr     edx,16
-    %1      ebp, ah, 1, ebx
-    %1      ebp, dh, 3, ebx
-    %2      ebx, dl, 2, ebx
-    %1      ebx, ch, 1, edx
-    %1      ebx, al, 0, edx
-    shr     eax,16
-    shr     ecx,16
-    %1      ebp, cl, 2, edx
-    %1      edi, ch, 3, edx
-    %1      esi, al, 2, edx
-    %1      ebx, ah, 3, edx
-
-%endmacro
-
-; Basic MOV and XOR Operations for normal rounds
-
-%macro  nr_xor  4
-    movzx   %4,%2
-    xor     %1,etab_%3(%4)
-%endmacro
-
-%macro  nr_mov  4
-    movzx   %4,%2
-    mov     %1,etab_%3(%4)
-%endmacro
-
-; Basic MOV and XOR Operations for last round
-
-%if 1
-
-    %macro  lr_xor  4
-        movzx   %4,%2
-        movzx   %4,etab_b(%4)
-    %if %3 != 0
-        shl     %4,8*%3
-    %endif
-        xor     %1,%4
-    %endmacro
-
-    %macro  lr_mov  4
-        movzx   %4,%2
-        movzx   %1,etab_b(%4)
-    %if %3 != 0
-        shl     %1,8*%3
-    %endif
-    %endmacro
-
-%else       ; less effective but worth leaving as an option
-
-    %macro  lr_xor  4
-        movzx   %4,%2
-        mov     %4,btab_%3(%4)
-        and     %4,0x000000ff << 8 * %3
-        xor     %1,%4
-    %endmacro
-
-    %macro  lr_mov  4
-        movzx   %4,%2
-        mov     %1,btab_%3(%4)
-        and     %1,0x000000ff << 8 * %3
-    %endmacro
-
-%endif
-
-; Apply S-Box to the 4 bytes in a 32-bit word and rotate byte positions
-
-%ifdef REDUCE_CODE_SIZE
-    
-l3s_col:
-    movzx   ecx,al              ; in      eax
-    movzx   ecx, etab_b(ecx)    ; out     eax
-    xor     edx,ecx             ; scratch ecx,edx
-    movzx   ecx,ah
-    movzx   ecx, etab_b(ecx)
-    shl     ecx,8
-    xor     edx,ecx
-    shr     eax,16
-    movzx   ecx,al
-    movzx   ecx, etab_b(ecx)
-    shl     ecx,16
-    xor     edx,ecx
-    movzx   ecx,ah
-    movzx   ecx, etab_b(ecx)
-    shl     ecx,24
-    xor     edx,ecx
-    mov     eax,edx
-    ret
-
-%else
-
-%macro l3s_col 0
-
-    movzx   ecx,al              ; in      eax
-    movzx   ecx, etab_b(ecx)    ; out     eax
-    xor     edx,ecx             ; scratch ecx,edx
-    movzx   ecx,ah
-    movzx   ecx, etab_b(ecx)
-    shl     ecx,8
-    xor     edx,ecx
-    shr     eax,16
-    movzx   ecx,al
-    movzx   ecx, etab_b(ecx)
-    shl     ecx,16
-    xor     edx,ecx
-    movzx   ecx,ah
-    movzx   ecx, etab_b(ecx)
-    shl     ecx,24
-    xor     edx,ecx
-    mov     eax,edx
-
-%endmacro
-
-%endif
-    
-; offsets to parameters
-
-in_blk  equ     2   ; input byte array address parameter
-out_blk equ     4   ; output byte array address parameter
-ctx     equ     6   ; AES context structure
-stk_spc equ    20   ; stack space
-
-%ifdef  ENCRYPTION
-
-; %define ENCRYPTION_TABLE
-
-%ifdef REDUCE_CODE_SIZE
-
-enc_round:
-	sub		sp, 2
-    add     ebp,16
-    save    1,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    rnd_fun nr_xor, nr_mov
-
-    mov     eax,ebp
-    mov     ecx,esi
-    mov     edx,edi
-    restore ebp,1
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-	add		sp, 2
-    ret
-    
-%else
-
-%macro enc_round 0
-
-    add     ebp,16
-    save    0,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    rnd_fun nr_xor, nr_mov
-
-    mov     eax,ebp
-    mov     ecx,esi
-    mov     edx,edi
-    restore ebp,0
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-
-%endmacro
-
-%endif
-
-%macro enc_last_round 0
-
-    add     ebp,16
-    save    0,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    rnd_fun lr_xor, lr_mov
-
-    mov     eax,ebp
-    restore ebp,0
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-
-%endmacro
-
-    section _TEXT
-
-; AES Encryption Subroutine
-
-    do_name _aes_encrypt,12
-
-	mov		ax, sp
-	movzx	esp, ax
-
-    sub     esp,stk_spc
-    mov     [esp+16],ebp
-    mov     [esp+12],ebx
-    mov     [esp+ 8],esi
-    mov     [esp+ 4],edi
-
-    movzx   esi,word [esp+in_blk+stk_spc] ; input pointer
-    mov     eax,[esi   ]
-    mov     ebx,[esi+ 4]
-    mov     ecx,[esi+ 8]
-    mov     edx,[esi+12]
-
-    movzx   ebp,word [esp+ctx+stk_spc]    ; key pointer
-    movzx   edi,byte [ebp+4*KS_LENGTH]
-    xor     eax,[ebp   ]
-    xor     ebx,[ebp+ 4]
-    xor     ecx,[ebp+ 8]
-    xor     edx,[ebp+12]
-
-; determine the number of rounds
-
-%ifndef AES_256
-    cmp     edi,10*16
-    je      .3
-    cmp     edi,12*16
-    je      .2
-    cmp     edi,14*16
-    je      .1
-    mov     eax,-1
-    jmp     .5
-%endif
-
-.1: mf_call enc_round
-    mf_call enc_round
-.2: mf_call enc_round
-    mf_call enc_round
-.3: mf_call enc_round
-    mf_call enc_round
-    mf_call enc_round
-    mf_call enc_round
-    mf_call enc_round
-    mf_call enc_round
-    mf_call enc_round
-    mf_call enc_round
-    mf_call enc_round
-    enc_last_round
-
-    movzx   edx,word [esp+out_blk+stk_spc]
-    mov     [edx],eax
-    mov     [edx+4],ebx
-    mov     [edx+8],esi
-    mov     [edx+12],edi
-    xor     eax,eax
-
-.5: mov     ebp,[esp+16]
-    mov     ebx,[esp+12]
-    mov     esi,[esp+ 8]
-    mov     edi,[esp+ 4]
-    add     esp,stk_spc
-    do_exit 12
-
-%endif
-
-%macro f_key 2
-
-    push    ecx
-    push    edx
-    mov     edx,esi
-    ror     eax,8
-    mf_call l3s_col
-    mov     esi,eax
-    pop     edx
-    pop     ecx
-    xor     esi,rc_val
-
-    mov     [ebp+%1*%2],esi
-    xor     edi,esi
-    mov     [ebp+%1*%2+4],edi
-    xor     ecx,edi
-    mov     [ebp+%1*%2+8],ecx
-    xor     edx,ecx
-    mov     [ebp+%1*%2+12],edx
-    mov     eax,edx
-
-%if %2 == 24
-
-%if %1 < 7
-    xor     eax,[ebp+%1*%2+16-%2]
-    mov     [ebp+%1*%2+16],eax
-    xor     eax,[ebp+%1*%2+20-%2]
-    mov     [ebp+%1*%2+20],eax
-%endif
-
-%elif %2 == 32
-
-%if %1 < 6
-    push    ecx
-    push    edx
-    mov     edx,[ebp+%1*%2+16-%2]
-    mf_call l3s_col
-    pop     edx
-    pop     ecx
-    mov     [ebp+%1*%2+16],eax
-    xor     eax,[ebp+%1*%2+20-%2]
-    mov     [ebp+%1*%2+20],eax
-    xor     eax,[ebp+%1*%2+24-%2]
-    mov     [ebp+%1*%2+24],eax
-    xor     eax,[ebp+%1*%2+28-%2]
-    mov     [ebp+%1*%2+28],eax
-%endif
-
-%endif
-
-%assign rc_val f2(rc_val)
-
-%endmacro
-
-%ifdef ENCRYPTION_KEY_SCHEDULE
-
-%ifdef  AES_128
-
-%ifndef ENCRYPTION_TABLE
-; %define ENCRYPTION_TABLE
-%endif
-
-%assign rc_val  1
-
-    do_name _aes_encrypt_key128,8
-
-    push    ebp
-    push    ebx
-    push    esi
-    push    edi
-
-    mov     ebp,[esp+24]
-    mov     [ebp+4*KS_LENGTH],dword 10*16
-    mov     ebx,[esp+20]
-
-    mov     esi,[ebx]
-    mov     [ebp],esi
-    mov     edi,[ebx+4]
-    mov     [ebp+4],edi
-    mov     ecx,[ebx+8]
-    mov     [ebp+8],ecx
-    mov     edx,[ebx+12]
-    mov     [ebp+12],edx
-    add     ebp,16
-    mov     eax,edx
-
-    f_key   0,16        ; 11 * 4 = 44 unsigned longs
-    f_key   1,16        ; 4 + 4 * 10 generated = 44
-    f_key   2,16
-    f_key   3,16
-    f_key   4,16
-    f_key   5,16
-    f_key   6,16
-    f_key   7,16
-    f_key   8,16
-    f_key   9,16
-
-    pop     edi
-    pop     esi
-    pop     ebx
-    pop     ebp
-    xor     eax,eax
-    do_exit  8
-
-%endif
-
-%ifdef  AES_192
-
-%ifndef ENCRYPTION_TABLE
-; %define ENCRYPTION_TABLE
-%endif
-
-%assign rc_val  1
-
-    do_name _aes_encrypt_key192,8
-
-    push    ebp
-    push    ebx
-    push    esi
-    push    edi
-
-    mov     ebp,[esp+24]
-    mov     [ebp+4*KS_LENGTH],dword 12 * 16
-    mov     ebx,[esp+20]
-
-    mov     esi,[ebx]
-    mov     [ebp],esi
-    mov     edi,[ebx+4]
-    mov     [ebp+4],edi
-    mov     ecx,[ebx+8]
-    mov     [ebp+8],ecx
-    mov     edx,[ebx+12]
-    mov     [ebp+12],edx
-    mov     eax,[ebx+16]
-    mov     [ebp+16],eax
-    mov     eax,[ebx+20]
-    mov     [ebp+20],eax
-    add     ebp,24
-
-    f_key   0,24        ; 13 * 4 = 52 unsigned longs
-    f_key   1,24        ; 6 + 6 * 8 generated = 54
-    f_key   2,24
-    f_key   3,24
-    f_key   4,24
-    f_key   5,24
-    f_key   6,24
-    f_key   7,24
-
-    pop     edi
-    pop     esi
-    pop     ebx
-    pop     ebp
-    xor     eax,eax
-    do_exit  8
-
-%endif
-
-%ifdef  AES_256
-
-%ifndef ENCRYPTION_TABLE
-; %define ENCRYPTION_TABLE
-%endif
-
-%assign rc_val  1
-
-    do_name _aes_encrypt_key256,8
-
-	mov		ax, sp
-	movzx	esp, ax
-	
-    push    ebp
-    push    ebx
-    push    esi
-    push    edi
-
-    movzx   ebp, word [esp+20] ; ks
-    mov     [ebp+4*KS_LENGTH],dword 14 * 16
-    movzx   ebx, word [esp+18] ; key
-
-    mov     esi,[ebx]
-    mov     [ebp],esi
-    mov     edi,[ebx+4]
-    mov     [ebp+4],edi
-    mov     ecx,[ebx+8]
-    mov     [ebp+8],ecx
-    mov     edx,[ebx+12]
-    mov     [ebp+12],edx
-    mov     eax,[ebx+16]
-    mov     [ebp+16],eax
-    mov     eax,[ebx+20]
-    mov     [ebp+20],eax
-    mov     eax,[ebx+24]
-    mov     [ebp+24],eax
-    mov     eax,[ebx+28]
-    mov     [ebp+28],eax
-    add     ebp,32
-
-    f_key   0,32        ; 15 * 4 = 60 unsigned longs
-    f_key   1,32        ; 8 + 8 * 7 generated = 64
-    f_key   2,32
-    f_key   3,32
-    f_key   4,32
-    f_key   5,32
-    f_key   6,32
-
-    pop     edi
-    pop     esi
-    pop     ebx
-    pop     ebp
-    xor     eax,eax
-    do_exit  8
-
-%endif
-
-%ifdef  AES_VAR
-
-%ifndef ENCRYPTION_TABLE
-; %define ENCRYPTION_TABLE
-%endif
-
-    do_name _aes_encrypt_key,12
-
-    mov     ecx,[esp+4]
-    mov     eax,[esp+8]
-    mov     edx,[esp+12]
-    push    edx
-    push    ecx
-
-    cmp     eax,16
-    je      .1
-    cmp     eax,128
-    je      .1
-
-    cmp     eax,24
-    je      .2
-    cmp     eax,192
-    je      .2
-
-    cmp     eax,32
-    je      .3
-    cmp     eax,256
-    je      .3
-    mov     eax,-1
-    add     esp,8
-    do_exit 12
-
-.1: do_call _aes_encrypt_key128,8
-    do_exit 12
-.2: do_call _aes_encrypt_key192,8
-    do_exit 12
-.3: do_call _aes_encrypt_key256,8
-    do_exit 12
-
-%endif
-
-%endif
-
-%ifdef ENCRYPTION_TABLE
-
-; S-box data - 256 entries
-
-    section _DATA
-
-%define u8(x)   0, x, x, f3(x), f2(x), x, x, f3(x)
-
-_aes_enc_tab:
-    db  u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5)
-    db  u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76)
-    db  u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0)
-    db  u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0)
-    db  u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc)
-    db  u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15)
-    db  u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a)
-    db  u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75)
-    db  u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0)
-    db  u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84)
-    db  u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b)
-    db  u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf)
-    db  u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85)
-    db  u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8)
-    db  u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5)
-    db  u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2)
-    db  u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17)
-    db  u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73)
-    db  u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88)
-    db  u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb)
-    db  u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c)
-    db  u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79)
-    db  u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9)
-    db  u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08)
-    db  u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6)
-    db  u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a)
-    db  u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e)
-    db  u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e)
-    db  u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94)
-    db  u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf)
-    db  u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68)
-    db  u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16)
-
-%endif
-
-%ifdef  DECRYPTION
-
-; %define DECRYPTION_TABLE
-
-%define dtab_0(x)   [_aes_dec_tab+  8*x]
-%define dtab_1(x)   [_aes_dec_tab+3+8*x]
-%define dtab_2(x)   [_aes_dec_tab+2+8*x]
-%define dtab_3(x)   [_aes_dec_tab+1+8*x]
-%define dtab_x(x)   byte [_aes_dec_tab+7+8*x]
-
-%macro irn_fun 2
-
-    rol eax,16
-    %1      esi, cl, 0, ebp
-    %1      esi, bh, 1, ebp
-    %1      esi, al, 2, ebp
-    %1      edi, dl, 0, ebp
-    %1      edi, ch, 1, ebp
-    %1      edi, ah, 3, ebp
-    %2      ebp, bl, 0, ebp
-    shr     eax,16
-    and     ebx,0xffff0000
-    or      ebx,eax
-    shr     ecx,16
-    %1      ebp, bh, 1, eax
-    %1      ebp, ch, 3, eax
-    %2      eax, cl, 2, ecx
-    %1      eax, bl, 0, ecx
-    %1      eax, dh, 1, ecx
-    shr     ebx,16
-    shr     edx,16
-    %1      esi, dh, 3, ecx
-    %1      ebp, dl, 2, ecx
-    %1      eax, bh, 3, ecx
-    %1      edi, bl, 2, ecx
-
-%endmacro
-
-; Basic MOV and XOR Operations for normal rounds
-
-%macro  ni_xor  4
-    movzx   %4,%2
-    xor     %1,dtab_%3(%4)
-%endmacro
-
-%macro  ni_mov  4
-    movzx   %4,%2
-    mov     %1,dtab_%3(%4)
-%endmacro
-
-; Basic MOV and XOR Operations for last round
-
-%macro  li_xor  4
-    movzx   %4,%2
-    movzx   %4,dtab_x(%4)
-%if %3 != 0
-    shl     %4,8*%3
-%endif
-    xor     %1,%4
-%endmacro
-
-%macro  li_mov  4
-    movzx   %4,%2
-    movzx   %1,dtab_x(%4)
-%if %3 != 0
-    shl     %1,8*%3
-%endif
-%endmacro
-
-%ifdef REDUCE_CODE_SIZE
-
-dec_round:
-	sub		sp, 2
-%ifdef AES_REV_DKS
-    add     ebp,16
-%else
-    sub     ebp,16
-%endif
-    save    1,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    irn_fun ni_xor, ni_mov
-
-    mov     ebx,ebp
-    mov     ecx,esi
-    mov     edx,edi
-    restore ebp,1
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-   	add		sp, 2
-    ret
-
-%else
-
-%macro dec_round 0
-
-%ifdef AES_REV_DKS
-    add     ebp,16
-%else
-    sub     ebp,16
-%endif
-    save    0,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    irn_fun ni_xor, ni_mov
-
-    mov     ebx,ebp
-    mov     ecx,esi
-    mov     edx,edi
-    restore ebp,0
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-
-%endmacro
-
-%endif
-
-%macro dec_last_round 0
-
-%ifdef AES_REV_DKS
-    add     ebp,16
-%else
-    sub     ebp,16
-%endif
-    save    0,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    irn_fun li_xor, li_mov
-
-    mov     ebx,ebp
-    restore ebp,0
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-
-%endmacro
-
-    section _TEXT
-
-; AES Decryption Subroutine
-
-    do_name _aes_decrypt,12
-    
-	mov		ax, sp
-	movzx	esp, ax
-
-    sub     esp,stk_spc
-    mov     [esp+16],ebp
-    mov     [esp+12],ebx
-    mov     [esp+ 8],esi
-    mov     [esp+ 4],edi
-
-; input four columns and xor in first round key
-
-    movzx   esi,word [esp+in_blk+stk_spc] ; input pointer
-    mov     eax,[esi   ]
-    mov     ebx,[esi+ 4]
-    mov     ecx,[esi+ 8]
-    mov     edx,[esi+12]
-    lea     esi,[esi+16]
-
-    movzx   ebp, word [esp+ctx+stk_spc]    ; key pointer
-    movzx   edi,byte[ebp+4*KS_LENGTH]
-%ifndef  AES_REV_DKS        ; if decryption key schedule is not reversed
-    lea     ebp,[ebp+edi] ; we have to access it from the top down
-%endif
-    xor     eax,[ebp   ]  ; key schedule
-    xor     ebx,[ebp+ 4]
-    xor     ecx,[ebp+ 8]
-    xor     edx,[ebp+12]
-
-; determine the number of rounds
-
-%ifndef AES_256
-    cmp     edi,10*16
-    je      .3
-    cmp     edi,12*16
-    je      .2
-    cmp     edi,14*16
-    je      .1
-    mov     eax,-1
-    jmp     .5
-%endif
-
-.1: mf_call dec_round
-    mf_call dec_round
-.2: mf_call dec_round
-    mf_call dec_round
-.3: mf_call dec_round
-    mf_call dec_round
-    mf_call dec_round
-    mf_call dec_round
-    mf_call dec_round
-    mf_call dec_round
-    mf_call dec_round
-    mf_call dec_round
-    mf_call dec_round
-    dec_last_round
-
-; move final values to the output array.
-
-    movzx   ebp,word [esp+out_blk+stk_spc]
-    mov     [ebp],eax
-    mov     [ebp+4],ebx
-    mov     [ebp+8],esi
-    mov     [ebp+12],edi
-    xor     eax,eax
-
-.5: mov     ebp,[esp+16]
-    mov     ebx,[esp+12]
-    mov     esi,[esp+ 8]
-    mov     edi,[esp+ 4]
-    add     esp,stk_spc
-    do_exit 12
-
-%endif
-
-%ifdef REDUCE_CODE_SIZE
-
-inv_mix_col:
-    movzx   ecx,dl          ; input  eax, edx
-    movzx   ecx,etab_b(ecx) ; output eax
-    mov     eax,dtab_0(ecx) ; used   ecx
-    movzx   ecx,dh
-    shr     edx,16
-    movzx   ecx,etab_b(ecx)
-    xor     eax,dtab_1(ecx)
-    movzx   ecx,dl
-    movzx   ecx,etab_b(ecx)
-    xor     eax,dtab_2(ecx)
-    movzx   ecx,dh
-    movzx   ecx,etab_b(ecx)
-    xor     eax,dtab_3(ecx)
-    ret
-
-%else
-
-%macro  inv_mix_col 0   
-
-    movzx   ecx,dl          ; input  eax, edx
-    movzx   ecx,etab_b(ecx) ; output eax
-    mov     eax,dtab_0(ecx) ; used   ecx
-    movzx   ecx,dh
-    shr     edx,16
-    movzx   ecx,etab_b(ecx)
-    xor     eax,dtab_1(ecx)
-    movzx   ecx,dl
-    movzx   ecx,etab_b(ecx)
-    xor     eax,dtab_2(ecx)
-    movzx   ecx,dh
-    movzx   ecx,etab_b(ecx)
-    xor     eax,dtab_3(ecx)
-
-%endmacro
-
-%endif
-
-%ifdef DECRYPTION_KEY_SCHEDULE
-
-%ifdef AES_128
-
-%ifndef DECRYPTION_TABLE
-; %define DECRYPTION_TABLE
-%endif
-
-    do_name _aes_decrypt_key128,8
-
-    push    ebp
-    push    ebx
-    push    esi
-    push    edi
-    mov     eax,[esp+24]    ; context
-    mov     edx,[esp+20]    ; key
-    push    eax
-    push    edx
-    do_call _aes_encrypt_key128,8   ; generate expanded encryption key
-    mov     eax,10*16
-    mov     esi,[esp+24]    ; pointer to first round key
-    lea     edi,[esi+eax]   ; pointer to last round key
-    add     esi,32
-                            ; the inverse mix column transformation
-    mov     edx,[esi-16]    ; needs to be applied to all round keys
-    mf_call inv_mix_col     ; except first and last. Hence start by
-    mov     [esi-16],eax    ; transforming the four sub-keys in the
-    mov     edx,[esi-12]    ; second round key
-    mf_call inv_mix_col
-    mov     [esi-12],eax    ; transformations for subsequent rounds
-    mov     edx,[esi-8]     ; can then be made more efficient by
-    mf_call inv_mix_col     ; noting that for three of the four sub-keys
-    mov     [esi-8],eax     ; in the encryption round key ek[r]:
-    mov     edx,[esi-4]     ;
-    mf_call inv_mix_col     ;   ek[r][n] = ek[r][n-1] ^ ek[r-1][n]
-    mov     [esi-4],eax     ;
-                            ; where n is 1..3. Hence the corresponding
-.0: mov     edx,[esi]       ; subkeys in the decryption round key dk[r]
-    mf_call inv_mix_col     ; also obey since inv_mix_col is linear in
-    mov     [esi],eax       ; GF(256):
-    xor     eax,[esi-12]    ;
-    mov     [esi+4],eax     ;   dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
-    xor     eax,[esi-8]     ;
-    mov     [esi+8],eax     ; So we only need one inverse mix column
-    xor     eax,[esi-4]     ; operation (n = 0) for each four word cycle
-    mov     [esi+12],eax    ; in the expanded key.
-    add     esi,16
-    cmp     edi,esi
-    jg      .0
-    jmp     dec_end
-
-%endif
-
-%ifdef AES_192
-
-%ifndef DECRYPTION_TABLE
-; %define DECRYPTION_TABLE
-%endif
-
-    do_name _aes_decrypt_key192,8
-
-    push    ebp
-    push    ebx
-    push    esi
-    push    edi
-    mov     eax,[esp+24]    ; context
-    mov     edx,[esp+20]    ; key
-    push    eax
-    push    edx
-    do_call _aes_encrypt_key192,8   ; generate expanded encryption key
-    mov     eax,12*16
-    mov     esi,[esp+24]    ; first round key
-    lea     edi,[esi+eax]   ; last round key
-    add     esi,48          ; the first 6 words are the key, of
-                            ; which the top 2 words are part of
-    mov     edx,[esi-32]    ; the second round key and hence
-    mf_call inv_mix_col     ; need to be modified. After this we
-    mov     [esi-32],eax    ; need to do a further six values prior
-    mov     edx,[esi-28]    ; to using a more efficient technique
-    mf_call inv_mix_col     ; based on:
-    mov     [esi-28],eax    ;
-                            ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
-    mov     edx,[esi-24]    ;
-    mf_call inv_mix_col     ; for n = 1 .. 5 where the key expansion
-    mov     [esi-24],eax    ; cycle is now 6 words long
-    mov     edx,[esi-20]
-    mf_call inv_mix_col
-    mov     [esi-20],eax
-    mov     edx,[esi-16]
-    mf_call inv_mix_col
-    mov     [esi-16],eax
-    mov     edx,[esi-12]
-    mf_call inv_mix_col
-    mov     [esi-12],eax
-    mov     edx,[esi-8]
-    mf_call inv_mix_col
-    mov     [esi-8],eax
-    mov     edx,[esi-4]
-    mf_call inv_mix_col
-    mov     [esi-4],eax
-
-.0: mov     edx,[esi]       ; the expanded key is 13 * 4 = 44 32-bit words
-    mf_call inv_mix_col     ; of which 11 * 4 = 44 have to be modified
-    mov     [esi],eax       ; using inv_mix_col.  We have already done 8
-    xor     eax,[esi-20]    ; of these so 36 are left - hence we need
-    mov     [esi+4],eax     ; exactly 6 loops of six here
-    xor     eax,[esi-16]
-    mov     [esi+8],eax
-    xor     eax,[esi-12]
-    mov     [esi+12],eax
-    xor     eax,[esi-8]
-    mov     [esi+16],eax
-    xor     eax,[esi-4]
-    mov     [esi+20],eax
-    add     esi,24
-    cmp     edi,esi
-    jg      .0
-    jmp     dec_end
-
-%endif
-
-%ifdef AES_256
-
-%ifndef DECRYPTION_TABLE
-; %define DECRYPTION_TABLE
-%endif
-
-    do_name _aes_decrypt_key256,8
-    
-    mov		ax, sp
-	movzx	esp, ax
-    push    ebp
-    push    ebx
-    push    esi
-    push    edi
-    
-    movzx   eax, word [esp+20] ; ks
-    movzx   edx, word [esp+18] ; key
-    push    ax
-    push    dx
-    do_call _aes_encrypt_key256,4   ; generate expanded encryption key
-    mov     eax,14*16
-    movzx   esi, word [esp+20] ; ks
-    lea     edi,[esi+eax]
-    add     esi,64
-
-    mov     edx,[esi-48]    ; the primary key is 8 words, of which
-    mf_call inv_mix_col     ; the top four require modification
-    mov     [esi-48],eax
-    mov     edx,[esi-44]
-    mf_call inv_mix_col
-    mov     [esi-44],eax
-    mov     edx,[esi-40]
-    mf_call inv_mix_col
-    mov     [esi-40],eax
-    mov     edx,[esi-36]
-    mf_call inv_mix_col
-    mov     [esi-36],eax
-
-    mov     edx,[esi-32]    ; the encryption key expansion cycle is
-    mf_call inv_mix_col     ; now eight words long so we need to
-    mov     [esi-32],eax    ; start by doing one complete block
-    mov     edx,[esi-28]
-    mf_call inv_mix_col
-    mov     [esi-28],eax
-    mov     edx,[esi-24]
-    mf_call inv_mix_col
-    mov     [esi-24],eax
-    mov     edx,[esi-20]
-    mf_call inv_mix_col
-    mov     [esi-20],eax
-    mov     edx,[esi-16]
-    mf_call inv_mix_col
-    mov     [esi-16],eax
-    mov     edx,[esi-12]
-    mf_call inv_mix_col
-    mov     [esi-12],eax
-    mov     edx,[esi-8]
-    mf_call inv_mix_col
-    mov     [esi-8],eax
-    mov     edx,[esi-4]
-    mf_call inv_mix_col
-    mov     [esi-4],eax
-
-.0: mov     edx,[esi]       ; we can now speed up the remaining
-    mf_call inv_mix_col     ; rounds by using the technique
-    mov     [esi],eax       ; outlined earlier.  But note that
-    xor     eax,[esi-28]    ; there is one extra inverse mix
-    mov     [esi+4],eax     ; column operation as the 256 bit
-    xor     eax,[esi-24]    ; key has an extra non-linear step
-    mov     [esi+8],eax     ; for the midway element.
-    xor     eax,[esi-20]
-    mov     [esi+12],eax    ; the expanded key is 15 * 4 = 60
-    mov     edx,[esi+16]    ; 32-bit words of which 52 need to
-    mf_call inv_mix_col     ; be modified.  We have already done
-    mov     [esi+16],eax    ; 12 so 40 are left - which means
-    xor     eax,[esi-12]    ; that we need exactly 5 loops of 8
-    mov     [esi+20],eax
-    xor     eax,[esi-8]
-    mov     [esi+24],eax
-    xor     eax,[esi-4]
-    mov     [esi+28],eax
-    add     esi,32
-    cmp     edi,esi
-    jg      .0
-
-%endif
-
-dec_end:
-
-%ifdef AES_REV_DKS
-
-    movzx   esi,word [esp+20]	; this reverses the order of the
-.1: mov     eax,[esi]			; round keys if required
-    mov     ebx,[esi+4]
-    mov     ebp,[edi]
-    mov     edx,[edi+4]
-    mov     [esi],ebp
-    mov     [esi+4],edx
-    mov     [edi],eax
-    mov     [edi+4],ebx
-
-    mov     eax,[esi+8]
-    mov     ebx,[esi+12]
-    mov     ebp,[edi+8]
-    mov     edx,[edi+12]
-    mov     [esi+8],ebp
-    mov     [esi+12],edx
-    mov     [edi+8],eax
-    mov     [edi+12],ebx
-
-    add     esi,16
-    sub     edi,16
-    cmp     edi,esi
-    jg      .1
-
-%endif
-
-    pop     edi
-    pop     esi
-    pop     ebx
-    pop     ebp
-    xor     eax,eax
-    do_exit  8
-
-%ifdef AES_VAR
-
-    do_name _aes_decrypt_key,12
-
-    mov     ecx,[esp+4]
-    mov     eax,[esp+8]
-    mov     edx,[esp+12]
-    push    edx
-    push    ecx
-
-    cmp     eax,16
-    je      .1
-    cmp     eax,128
-    je      .1
-
-    cmp     eax,24
-    je      .2
-    cmp     eax,192
-    je      .2
-
-    cmp     eax,32
-    je      .3
-    cmp     eax,256
-    je      .3
-    mov     eax,-1
-    add     esp,8
-    do_exit 12
-
-.1: do_call _aes_decrypt_key128,8
-    do_exit 12
-.2: do_call _aes_decrypt_key192,8
-    do_exit 12
-.3: do_call _aes_decrypt_key256,8
-    do_exit 12
-
-%endif
-
-%endif
-
-%ifdef DECRYPTION_TABLE
-
-; Inverse S-box data - 256 entries
-
-    section _DATA
-
-%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
-
-_aes_dec_tab:
-    db  v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38)
-    db  v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb)
-    db  v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87)
-    db  v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb)
-    db  v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d)
-    db  v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e)
-    db  v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2)
-    db  v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25)
-    db  v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16)
-    db  v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92)
-    db  v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda)
-    db  v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84)
-    db  v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a)
-    db  v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06)
-    db  v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02)
-    db  v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b)
-    db  v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea)
-    db  v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73)
-    db  v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85)
-    db  v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e)
-    db  v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89)
-    db  v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b)
-    db  v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20)
-    db  v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4)
-    db  v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31)
-    db  v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f)
-    db  v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d)
-    db  v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef)
-    db  v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0)
-    db  v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61)
-    db  v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26)
-    db  v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d)
-
-%endif
+
+; ---------------------------------------------------------------------------
+; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+; 
+; LICENSE TERMS
+; 
+; The free distribution and use of this software is allowed (with or without
+; changes) provided that:
+; 
+;  1. source code distributions include the above copyright notice, this
+;     list of conditions and the following disclaimer;
+; 
+;  2. binary distributions include the above copyright notice, this list
+;     of conditions and the following disclaimer in their documentation;
+; 
+;  3. the name of the copyright holder is not used to endorse products
+;     built using this software without specific written permission.
+; 
+; DISCLAIMER
+; 
+; This software is provided 'as is' with no explicit or implied warranties
+; in respect of its properties, including, but not limited to, correctness
+; and/or fitness for purpose.
+; ---------------------------------------------------------------------------
+; Issue 20/12/2007
+;
+; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h
+; and the same define to be set here as well. If ASM_X86_V2C is set this file
+; requires the C files aeskey.c and aestab.c for support.
+
+; An AES implementation for x86 processors using the YASM (or NASM) assembler.
+; This is a full assembler implementation covering encryption, decryption and
+; key scheduling. It uses 2k bytes of tables but its encryption and decryption
+; performance is very close to that obtained using large tables.  Key schedule
+; expansion is slower for both encryption and decryption but this is likely to
+; be offset by the much smaller load that this version places on the processor
+; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of
+; the design of the AES round function used here.
+;
+; This code provides the standard AES block size (128 bits, 16 bytes) and the
+; three standard AES key sizes (128, 192 and 256 bits). It has the same call
+; interface as my C implementation. The ebx, esi, edi and ebp registers are
+; preserved across calls but eax, ecx and edx and the arithmetic status flags
+; are not.  Although this is a full assembler implementation, it can be used
+; in conjunction with my C code which provides faster key scheduling using
+; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C
+; defined.  It is also important that the defines below match those used in the
+; C code.  This code uses the VC++ register saving conventions; if it is used
+; with another compiler, conventions for using and saving registers may need
+; to be checked (and calling conventions).  The YASM command line for the VC++
+; custom build step is:
+;
+;    yasm -Xvc -f win32 -D <Z> -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"
+;
+; For the cryptlib build this is (pcg):
+;
+;	yasm -Xvc -f win32 -D ASM_X86_V2C -o aescrypt2.obj aes_x86_v2.asm
+;
+; where <Z> is ASM_X86_V2 or ASM_X86_V2C.  The calling interfaces are:
+;
+;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
+;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
+;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
+;                                            const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
+;                                            const aes_decrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_encrypt_key(const unsigned char key[],
+;                           unsigned int len, const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt_key(const unsigned char key[],
+;                           unsigned int len, const aes_decrypt_ctx cx[1]);
+;
+; where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
+; either bits or bytes.
+
+; The DLL interface must use the _stdcall convention in which the number
+; of bytes of parameter space is added after an @ to the subroutine's name.
+; We must also remove our parameters from the stack before return (see
+; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
+
+;
+; Adapted for TrueCrypt:
+; - All tables generated at run-time
+; - Adapted for 16-bit environment
+;
+
+CPU 386                                 ; limit the assembler to the 386 instruction set
+USE16                                   ; assemble for 16-bit mode
+SEGMENT _TEXT PUBLIC CLASS=CODE USE16
+SEGMENT _DATA PUBLIC CLASS=DATA USE16
+; both segments are declared 16-bit and are merged into one group below
+GROUP DGROUP _TEXT _DATA
+
+extern _aes_dec_tab		; Aestab.c
+extern _aes_enc_tab     ; imported as well; the in-file table is only built if ENCRYPTION_TABLE is defined
+
+; %define DLL_EXPORT
+
+; The size of the code can be reduced by using functions for the encryption
+; and decryption rounds in place of macro expansion
+
+%define REDUCE_CODE_SIZE
+
+; Comment in/out the following lines to obtain the desired subroutines. These
+; selections MUST match those in the C header file aes.h
+
+; %define AES_128                 ; define if AES with 128 bit keys is needed
+; %define AES_192                 ; define if AES with 192 bit keys is needed
+%define AES_256                 ; define if AES with 256 bit keys is needed
+; %define AES_VAR                 ; define if a variable key size is needed
+%define ENCRYPTION              ; define if encryption is needed
+%define DECRYPTION              ; define if decryption is needed
+; %define AES_REV_DKS             ; define if key decryption schedule is reversed
+
+%ifndef ASM_X86_V2C
+%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed
+%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed
+%endif
+
+; The encryption key schedule has the following in memory layout where N is the
+; number of rounds (10, 12 or 14):
+;
+; lo: | input key (round 0)  |  ; each round is four 32-bit words
+;     | encryption round 1   |
+;     | encryption round 2   |
+;     ....
+;     | encryption round N-1 |
+; hi: | encryption round N   |
+;
+; The decryption key schedule is normally set up so that it has the same
+; layout as above by actually reversing the order of the encryption key
+; schedule in memory (this happens when AES_REV_DKS is set):
+;
+; lo: | decryption round 0   | =              | encryption round N   |
+;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
+;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
+;     ....                       ....
+;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
+; hi: | decryption round N   | =              | input key (round 0)  |
+;
+; with rounds except the first and last modified using inv_mix_column()
+; But if AES_REV_DKS is NOT set the order of keys is left as it is for
+; encryption so that it has to be accessed in reverse when used for
+; decryption (although the inverse mix column modifications are done)
+;
+; lo: | decryption round 0   | =              | input key (round 0)  |
+;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
+;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
+;     ....                       ....
+;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
+; hi: | decryption round N   | =              | encryption round N   |
+;
+; This layout is faster when the assembler key scheduling provided here
+; is used.
+;
+; End of user defines
+
+%ifdef AES_VAR                  ; variable key size: force all three fixed sizes on
+%ifndef AES_128
+%define AES_128
+%endif
+%ifndef AES_192
+%define AES_192
+%endif
+%ifndef AES_256
+%define AES_256
+%endif
+%endif
+; KS_LENGTH counts 32-bit words; code addresses the byte just past them as 4*KS_LENGTH
+%ifdef AES_VAR
+%define KS_LENGTH       60      ; sized for the largest (256-bit) schedule
+%elifdef AES_256
+%define KS_LENGTH       60
+%elifdef AES_192
+%define KS_LENGTH       52
+%else
+%define KS_LENGTH       44
+%endif
+
+; These macros implement stack based local variables
+
+%macro  save 2                  ; save <slot>,<reg>
+    mov     [esp+4*%1],%2       ; store %2 in the dword at esp + 4*slot
+%endmacro
+
+%macro  restore 2               ; restore <reg>,<slot>
+    mov     %1,[esp+4*%2]       ; reload %1 from the dword at esp + 4*slot
+%endmacro
+
+%ifdef  REDUCE_CODE_SIZE        ; helpers exist as labelled subroutines
+    %macro mf_call 1
+        call %1                 ; invoke the helper as a subroutine
+    %endmacro
+%else                           ; helpers exist as macros of the same name
+    %macro mf_call 1
+        %1                      ; expand the helper inline at the use site
+    %endmacro
+%endif
+
+; the DLL has to implement the _stdcall calling interface on return
+; In this case we have to take our parameters (3 4-byte pointers)
+; off the stack
+
+%define parms 12                ; default parameter byte count for do_name/do_call/do_exit
+; do_name <symbol>[,<parameter bytes>]: emit the public entry label for a routine
+%macro  do_name 1-2 parms
+%ifndef DLL_EXPORT
+    global  %1                  ; plain, undecorated public label
+%1:
+%else
+    global  %1@%2               ; decorated as symbol@bytes
+    export  %1@%2
+%1@%2:
+%endif
+%endmacro
+
+%macro  do_call 1-2 parms
+%ifndef DLL_EXPORT
+    call    %1                  ; undecorated name
+    add     esp,%2              ; caller discards %2 bytes of pushed arguments
+%else
+    call    %1@%2               ; decorated name; no stack adjustment is emitted here
+%endif
+%endmacro
+
+%macro  do_exit  0-1 parms
+%ifdef DLL_EXPORT
+    ret %1                      ; return and pop %1 bytes of parameters
+%else
+    ret                         ; plain return; parameters stay on the stack
+%endif
+%endmacro
+
+; finite field multiplies by {02}, {04} and {08}
+
+%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b)) ; xor 0x11b when bit 7 of x is set, cancelling the bit shifted into position 8
+%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) ; bits 6 and 7 of x each select a reduction term
+%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) ; bits 5, 6 and 7 of x each select a reduction term
+
+; finite field multiplies required in table generation
+
+%define f3(x)   (f2(x) ^ x)                 ; {03} = {02} ^ {01}
+%define f9(x)   (f8(x) ^ x)                 ; {09} = {08} ^ {01}
+%define fb(x)   (f8(x) ^ f2(x) ^ x)         ; {0b} = {08} ^ {02} ^ {01}
+%define fd(x)   (f8(x) ^ f4(x) ^ x)         ; {0d} = {08} ^ {04} ^ {01}
+%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))     ; {0e} = {08} ^ {04} ^ {02}
+
+%define etab_0(x)   [_aes_enc_tab+4+8*x]    ; operand at byte offset 4 of the 8-byte entry x
+%define etab_1(x)   [_aes_enc_tab+3+8*x]    ; operand at byte offset 3 of entry x
+%define etab_2(x)   [_aes_enc_tab+2+8*x]    ; operand at byte offset 2 of entry x
+%define etab_3(x)   [_aes_enc_tab+1+8*x]    ; operand at byte offset 1 of entry x
+%define etab_b(x)   byte [_aes_enc_tab+1+8*x] ; used with movzx for 0x000000xx
+%define etab_w(x)   word [_aes_enc_tab+8*x]   ; used with movzx for 0x0000xx00
+; btab_N: alternative operands used only by the disabled (%else) last-round macros
+%define btab_0(x)   [_aes_enc_tab+6+8*x]
+%define btab_1(x)   [_aes_enc_tab+5+8*x]
+%define btab_2(x)   [_aes_enc_tab+4+8*x]
+%define btab_3(x)   [_aes_enc_tab+3+8*x]
+
+; ROUND FUNCTION.  Build column[2] on ESI and column[3] on EDI that have the
+; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
+;
+; Input:
+;
+;   EAX     column[0]
+;   EBX     column[1]
+;   ECX     column[2]
+;   EDX     column[3]
+;   ESI     column key[round][2]
+;   EDI     column key[round][3]
+;   EBP     scratch
+;
+; Output:
+;
+;   EBP     column[0]   unkeyed
+;   EBX     column[1]   unkeyed
+;   ESI     column[2]   keyed
+;   EDI     column[3]   keyed
+;   EAX     scratch
+;   ECX     scratch
+;   EDX     scratch
+
+%macro rnd_fun 2                ; %1 = xor-style lookup macro, %2 = mov-style lookup macro
+; each lookup takes: destination, source byte register, table number, index scratch
+    rol     ebx,16              ; swap halves so bl/bh expose the upper two bytes of ebx
+    %1      esi, cl, 0, ebp
+    %1      esi, dh, 1, ebp
+    %1      esi, bh, 3, ebp
+    %1      edi, dl, 0, ebp
+    %1      edi, ah, 1, ebp
+    %1      edi, bl, 2, ebp
+    %2      ebp, al, 0, ebp     ; first term of column[0]; ebp is index scratch and destination
+    shr     ebx,16              ; ebx = its original low 16 bits, zero-extended
+    and     eax,0xffff0000
+    or      eax,ebx             ; park ebx's original low half in ax so ebx can be reused
+    shr     edx,16              ; dl/dh now expose the upper two bytes of edx
+    %1      ebp, ah, 1, ebx
+    %1      ebp, dh, 3, ebx
+    %2      ebx, dl, 2, ebx     ; first term of column[1]; overwrites ebx
+    %1      ebx, ch, 1, edx     ; edx is index scratch from here on
+    %1      ebx, al, 0, edx
+    shr     eax,16              ; al/ah now expose the upper two bytes of eax
+    shr     ecx,16              ; cl/ch now expose the upper two bytes of ecx
+    %1      ebp, cl, 2, edx
+    %1      edi, ch, 3, edx
+    %1      esi, al, 2, edx
+    %1      ebx, ah, 3, edx
+
+%endmacro
+
+; Basic MOV and XOR Operations for normal rounds
+
+%macro  nr_xor  4               ; nr_xor <dst>,<byte reg>,<table n>,<scratch>
+    movzx   %4,%2               ; zero-extend the byte to form the table index
+    xor     %1,etab_%3(%4)      ; xor the etab_<n> operand for that index into dst
+%endmacro
+
+%macro  nr_mov  4               ; nr_mov <dst>,<byte reg>,<table n>,<scratch>
+    movzx   %4,%2               ; zero-extend the byte to form the table index
+    mov     %1,etab_%3(%4)      ; load the etab_<n> operand for that index, replacing dst
+%endmacro
+
+; Basic MOV and XOR Operations for last round
+
+%if 1                           ; constant condition: the byte-lookup variants below are the ones assembled
+
+    %macro  lr_xor  4
+        movzx   %4,%2           ; table index from the source byte
+        movzx   %4,etab_b(%4)   ; fetch a single table byte, zero-extended
+    %if %3 != 0
+        shl     %4,8*%3         ; move it to byte lane %3
+    %endif
+        xor     %1,%4           ; fold it into dst
+    %endmacro
+
+    %macro  lr_mov  4
+        movzx   %4,%2           ; table index from the source byte
+        movzx   %1,etab_b(%4)   ; dst = single table byte, zero-extended
+    %if %3 != 0
+        shl     %1,8*%3         ; move it to byte lane %3
+    %endif
+    %endmacro
+
+%else       ; less effective but worth leaving as an option
+
+    %macro  lr_xor  4
+        movzx   %4,%2
+        mov     %4,btab_%3(%4)  ; load via the btab_<n> operand
+        and     %4,0x000000ff << 8 * %3 ; keep only byte lane %3
+        xor     %1,%4
+    %endmacro
+
+    %macro  lr_mov  4
+        movzx   %4,%2
+        mov     %1,btab_%3(%4)  ; load via the btab_<n> operand
+        and     %1,0x000000ff << 8 * %3 ; keep only byte lane %3
+    %endmacro
+
+%endif
+
+; Apply S-Box to the 4 bytes in a 32-bit word and rotate byte positions
+
+%ifdef REDUCE_CODE_SIZE
+    
+l3s_col:                        ; subroutine form; note edx on entry is xor-accumulated, not just scratch
+    movzx   ecx,al              ; in      eax
+    movzx   ecx, etab_b(ecx)    ; out     eax
+    xor     edx,ecx             ; scratch ecx,edx
+    movzx   ecx,ah
+    movzx   ecx, etab_b(ecx)
+    shl     ecx,8               ; lookup of byte 1 goes back to lane 1
+    xor     edx,ecx
+    shr     eax,16              ; expose the upper two bytes of eax as al/ah
+    movzx   ecx,al
+    movzx   ecx, etab_b(ecx)
+    shl     ecx,16              ; lane 2
+    xor     edx,ecx
+    movzx   ecx,ah
+    movzx   ecx, etab_b(ecx)
+    shl     ecx,24              ; lane 3
+    xor     edx,ecx
+    mov     eax,edx             ; result = incoming edx ^ the four looked-up bytes
+    ret
+
+%else
+
+%macro l3s_col 0                ; inline form, same instruction sequence without the ret
+
+    movzx   ecx,al              ; in      eax
+    movzx   ecx, etab_b(ecx)    ; out     eax
+    xor     edx,ecx             ; scratch ecx,edx
+    movzx   ecx,ah
+    movzx   ecx, etab_b(ecx)
+    shl     ecx,8
+    xor     edx,ecx
+    shr     eax,16
+    movzx   ecx,al
+    movzx   ecx, etab_b(ecx)
+    shl     ecx,16
+    xor     edx,ecx
+    movzx   ecx,ah
+    movzx   ecx, etab_b(ecx)
+    shl     ecx,24
+    xor     edx,ecx
+    mov     eax,edx             ; result = incoming edx ^ the four looked-up bytes
+
+%endmacro
+
+%endif
+    
+; offsets to parameters
+
+in_blk  equ     2   ; input byte array address parameter (read as a word by the callers)
+out_blk equ     4   ; output byte array address parameter (read as a word)
+ctx     equ     6   ; AES context structure (pointer read as a word)
+stk_spc equ    20   ; stack space reserved with sub esp,stk_spc; added to the offsets above
+
+%ifdef  ENCRYPTION
+
+; %define ENCRYPTION_TABLE
+
+%ifdef REDUCE_CODE_SIZE
+; subroutine form: one normal encryption round; state in eax,ebx,ecx,edx, key pointer in ebp
+enc_round:
+	sub		sp, 2               ; presumably pads the 2-byte return address so slot 1 here is the caller's slot 0 - TODO confirm
+    add     ebp,16              ; step to the next 16-byte round key
+    save    1,ebp               ; rnd_fun overwrites ebp, so keep the key pointer on the stack
+    mov     esi,[ebp+8]         ; preload key word 2
+    mov     edi,[ebp+12]        ; preload key word 3
+
+    rnd_fun nr_xor, nr_mov
+
+    mov     eax,ebp             ; column[0] (still unkeyed)
+    mov     ecx,esi             ; column[2] (already keyed)
+    mov     edx,edi             ; column[3] (already keyed)
+    restore ebp,1               ; recover the key pointer
+    xor     eax,[ebp]           ; apply key word 0
+    xor     ebx,[ebp+4]         ; apply key word 1
+	add		sp, 2               ; undo the adjustment made on entry
+    ret
+    
+%else
+
+%macro enc_round 0              ; inline form: same round, using stack slot 0 directly
+
+    add     ebp,16
+    save    0,ebp
+    mov     esi,[ebp+8]
+    mov     edi,[ebp+12]
+
+    rnd_fun nr_xor, nr_mov
+
+    mov     eax,ebp
+    mov     ecx,esi
+    mov     edx,edi
+    restore ebp,0
+    xor     eax,[ebp]
+    xor     ebx,[ebp+4]
+
+%endmacro
+
+%endif
+
+%macro enc_last_round 0         ; always expanded inline; uses the last-round lookup macros
+
+    add     ebp,16              ; step to the final round key
+    save    0,ebp               ; keep the key pointer in stack slot 0 while ebp is reused
+    mov     esi,[ebp+8]         ; preload key word 2
+    mov     edi,[ebp+12]        ; preload key word 3
+
+    rnd_fun lr_xor, lr_mov
+
+    mov     eax,ebp             ; column[0]; columns 2 and 3 are left in esi/edi for the caller
+    restore ebp,0
+    xor     eax,[ebp]           ; apply key word 0
+    xor     ebx,[ebp+4]         ; apply key word 1
+
+%endmacro
+
+    section _TEXT
+
+; AES Encryption Subroutine
+
+    do_name _aes_encrypt,12
+; in: word-sized pointers in_blk, out_blk, ctx on the stack; out: eax = 0, or -1 from the round-count check
+	mov		ax, sp
+	movzx	esp, ax             ; esp = zero-extended sp (upper 16 bits cleared)
+
+    sub     esp,stk_spc
+    mov     [esp+16],ebp        ; save the registers that are restored at .5
+    mov     [esp+12],ebx
+    mov     [esp+ 8],esi
+    mov     [esp+ 4],edi
+
+    movzx   esi,word [esp+in_blk+stk_spc] ; input pointer
+    mov     eax,[esi   ]        ; load the 16-byte block as four dword columns
+    mov     ebx,[esi+ 4]
+    mov     ecx,[esi+ 8]
+    mov     edx,[esi+12]
+
+    movzx   ebp,word [esp+ctx+stk_spc]    ; key pointer
+    movzx   edi,byte [ebp+4*KS_LENGTH]    ; byte stored just past the schedule words (rounds*16)
+    xor     eax,[ebp   ]        ; xor in the first round key
+    xor     ebx,[ebp+ 4]
+    xor     ecx,[ebp+ 8]
+    xor     edx,[ebp+12]
+
+; determine the number of rounds
+
+%ifndef AES_256                 ; with AES_256 defined this dispatch is omitted and execution falls into .1
+    cmp     edi,10*16
+    je      .3
+    cmp     edi,12*16
+    je      .2
+    cmp     edi,14*16
+    je      .1
+    mov     eax,-1              ; unrecognised value: return -1
+    jmp     .5
+%endif
+
+.1: mf_call enc_round           ; entry for 14*16: two extra rounds
+    mf_call enc_round
+.2: mf_call enc_round           ; entry for 12*16: two extra rounds
+    mf_call enc_round
+.3: mf_call enc_round           ; entry for 10*16: nine normal rounds, then the last round
+    mf_call enc_round
+    mf_call enc_round
+    mf_call enc_round
+    mf_call enc_round
+    mf_call enc_round
+    mf_call enc_round
+    mf_call enc_round
+    mf_call enc_round
+    enc_last_round
+
+    movzx   edx,word [esp+out_blk+stk_spc]
+    mov     [edx],eax
+    mov     [edx+4],ebx
+    mov     [edx+8],esi         ; the last round leaves columns 2 and 3 in esi/edi
+    mov     [edx+12],edi
+    xor     eax,eax             ; return 0
+
+.5: mov     ebp,[esp+16]        ; common exit: restore saved registers
+    mov     ebx,[esp+12]
+    mov     esi,[esp+ 8]
+    mov     edi,[esp+ 4]
+    add     esp,stk_spc
+    do_exit 12
+
+%endif
+
+%macro f_key 2                  ; f_key <step>,<cycle bytes>: emit one key-expansion step at [ebp+step*cycle]
+; in: esi,edi,ecx,edx = first four words of the previous cycle, eax = last word written
+    push    ecx                 ; l3s_col uses ecx and edx, so preserve them
+    push    edx
+    mov     edx,esi             ; l3s_col xors its lookups into edx: seed it with the previous first word
+    ror     eax,8               ; rotate the previous last word right by one byte
+    mf_call l3s_col
+    mov     esi,eax
+    pop     edx
+    pop     ecx
+    xor     esi,rc_val          ; rc_val is an assemble-time constant, advanced at the end of this macro
+
+    mov     [ebp+%1*%2],esi     ; each new word is the previous new word xor the old word in that position
+    xor     edi,esi
+    mov     [ebp+%1*%2+4],edi
+    xor     ecx,edi
+    mov     [ebp+%1*%2+8],ecx
+    xor     edx,ecx
+    mov     [ebp+%1*%2+12],edx
+    mov     eax,edx             ; eax tracks the last word written
+
+%if %2 == 24                    ; 24-byte cycle: two further words, skipped on step 7
+
+%if %1 < 7
+    xor     eax,[ebp+%1*%2+16-%2]
+    mov     [ebp+%1*%2+16],eax
+    xor     eax,[ebp+%1*%2+20-%2]
+    mov     [ebp+%1*%2+20],eax
+%endif
+
+%elif %2 == 32                  ; 32-byte cycle: four further words, skipped on step 6
+
+%if %1 < 6
+    push    ecx
+    push    edx
+    mov     edx,[ebp+%1*%2+16-%2] ; word one cycle back seeds the xor accumulator
+    mf_call l3s_col             ; table lookups on eax without the rotation used above
+    pop     edx
+    pop     ecx
+    mov     [ebp+%1*%2+16],eax
+    xor     eax,[ebp+%1*%2+20-%2]
+    mov     [ebp+%1*%2+20],eax
+    xor     eax,[ebp+%1*%2+24-%2]
+    mov     [ebp+%1*%2+24],eax
+    xor     eax,[ebp+%1*%2+28-%2]
+    mov     [ebp+%1*%2+28],eax
+%endif
+
+%endif
+
+%assign rc_val f2(rc_val)       ; advance the constant for the next f_key expansion
+
+%endmacro
+
+%ifdef ENCRYPTION_KEY_SCHEDULE
+
+%ifdef  AES_128
+
+%ifndef ENCRYPTION_TABLE
+; %define ENCRYPTION_TABLE
+%endif
+
+%assign rc_val  1
+
+    do_name _aes_encrypt_key128,8
+; NOTE(review): arguments are read as dwords at [esp+20]/[esp+24], unlike _aes_encrypt_key256 (word [esp+18]/[esp+20]); looks not adapted for USE16 - confirm before enabling AES_128
+    push    ebp
+    push    ebx
+    push    esi
+    push    edi
+
+    mov     ebp,[esp+24]        ; second argument: destination for the schedule
+    mov     [ebp+4*KS_LENGTH],dword 10*16 ; record 10*16 just past the schedule words
+    mov     ebx,[esp+20]        ; first argument: source key bytes
+
+    mov     esi,[ebx]           ; copy the 16-byte key as the first four schedule words
+    mov     [ebp],esi
+    mov     edi,[ebx+4]
+    mov     [ebp+4],edi
+    mov     ecx,[ebx+8]
+    mov     [ebp+8],ecx
+    mov     edx,[ebx+12]
+    mov     [ebp+12],edx
+    add     ebp,16              ; f_key writes relative to the end of the copied key
+    mov     eax,edx             ; f_key expects the last word in eax
+
+    f_key   0,16        ; 11 * 4 = 44 unsigned longs
+    f_key   1,16        ; 4 + 4 * 10 generated = 44
+    f_key   2,16
+    f_key   3,16
+    f_key   4,16
+    f_key   5,16
+    f_key   6,16
+    f_key   7,16
+    f_key   8,16
+    f_key   9,16
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    pop     ebp
+    xor     eax,eax             ; return 0
+    do_exit  8
+
+%endif
+
+%ifdef  AES_192
+
+%ifndef ENCRYPTION_TABLE
+; %define ENCRYPTION_TABLE
+%endif
+
+%assign rc_val  1
+
+    do_name _aes_encrypt_key192,8
+; NOTE(review): arguments are read as dwords at [esp+20]/[esp+24], unlike _aes_encrypt_key256 (word [esp+18]/[esp+20]); looks not adapted for USE16 - confirm before enabling AES_192
+    push    ebp
+    push    ebx
+    push    esi
+    push    edi
+
+    mov     ebp,[esp+24]        ; second argument: destination for the schedule
+    mov     [ebp+4*KS_LENGTH],dword 12 * 16 ; record 12*16 just past the schedule words
+    mov     ebx,[esp+20]        ; first argument: source key bytes
+
+    mov     esi,[ebx]           ; copy the 24-byte key as the first six schedule words
+    mov     [ebp],esi
+    mov     edi,[ebx+4]
+    mov     [ebp+4],edi
+    mov     ecx,[ebx+8]
+    mov     [ebp+8],ecx
+    mov     edx,[ebx+12]
+    mov     [ebp+12],edx
+    mov     eax,[ebx+16]
+    mov     [ebp+16],eax
+    mov     eax,[ebx+20]        ; eax ends up holding the last key word, as f_key expects
+    mov     [ebp+20],eax
+    add     ebp,24              ; f_key writes relative to the end of the copied key
+
+    f_key   0,24        ; 13 * 4 = 52 unsigned longs
+    f_key   1,24        ; 6 + 6 * 8 generated = 54
+    f_key   2,24
+    f_key   3,24
+    f_key   4,24
+    f_key   5,24
+    f_key   6,24
+    f_key   7,24
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    pop     ebp
+    xor     eax,eax             ; return 0
+    do_exit  8
+
+%endif
+
+%ifdef  AES_256
+
+%ifndef ENCRYPTION_TABLE
+; %define ENCRYPTION_TABLE
+%endif
+
+%assign rc_val  1
+
+    do_name _aes_encrypt_key256,8
+; in: word-sized pointers key, ks on the stack; out: eax = 0
+	mov		ax, sp
+	movzx	esp, ax             ; esp = zero-extended sp (upper 16 bits cleared)
+	
+    push    ebp
+    push    ebx
+    push    esi
+    push    edi
+
+    movzx   ebp, word [esp+20] ; ks
+    mov     [ebp+4*KS_LENGTH],dword 14 * 16 ; record 14*16 just past the schedule words
+    movzx   ebx, word [esp+18] ; key
+
+    mov     esi,[ebx]           ; copy the 32-byte key as the first eight schedule words
+    mov     [ebp],esi
+    mov     edi,[ebx+4]
+    mov     [ebp+4],edi
+    mov     ecx,[ebx+8]
+    mov     [ebp+8],ecx
+    mov     edx,[ebx+12]
+    mov     [ebp+12],edx
+    mov     eax,[ebx+16]
+    mov     [ebp+16],eax
+    mov     eax,[ebx+20]
+    mov     [ebp+20],eax
+    mov     eax,[ebx+24]
+    mov     [ebp+24],eax
+    mov     eax,[ebx+28]        ; eax ends up holding the last key word, as f_key expects
+    mov     [ebp+28],eax
+    add     ebp,32              ; f_key writes relative to the end of the copied key
+
+    f_key   0,32        ; 15 * 4 = 60 unsigned longs
+    f_key   1,32        ; 8 + 8 * 7 generated = 64
+    f_key   2,32
+    f_key   3,32
+    f_key   4,32
+    f_key   5,32
+    f_key   6,32
+
+    pop     edi
+    pop     esi
+    pop     ebx
+    pop     ebp
+    xor     eax,eax             ; return 0
+    do_exit  8
+
+%endif
+
+%ifdef  AES_VAR
+
+%ifndef ENCRYPTION_TABLE
+; %define ENCRYPTION_TABLE
+%endif
+
+    do_name _aes_encrypt_key,12
+; NOTE(review): arguments are read as dwords at [esp+4]/[esp+8]/[esp+12], unlike the word-sized reads in _aes_encrypt_key256; looks not adapted for USE16 - confirm before enabling AES_VAR
+    mov     ecx,[esp+4]         ; first argument
+    mov     eax,[esp+8]         ; second argument: length, compared against byte and bit counts below
+    mov     edx,[esp+12]        ; third argument
+    push    edx                 ; re-push first and third arguments for the fixed-size routine
+    push    ecx
+
+    cmp     eax,16
+    je      .1
+    cmp     eax,128
+    je      .1
+
+    cmp     eax,24
+    je      .2
+    cmp     eax,192
+    je      .2
+
+    cmp     eax,32
+    je      .3
+    cmp     eax,256
+    je      .3
+    mov     eax,-1              ; unsupported length: return -1
+    add     esp,8               ; drop the two re-pushed arguments
+    do_exit 12
+
+.1: do_call _aes_encrypt_key128,8
+    do_exit 12
+.2: do_call _aes_encrypt_key192,8
+    do_exit 12
+.3: do_call _aes_encrypt_key256,8
+    do_exit 12
+
+%endif
+
+%endif
+
+%ifdef ENCRYPTION_TABLE
+
+; S-box data - 256 entries
+
+    section _DATA
+
+%define u8(x)   0, x, x, f3(x), f2(x), x, x, f3(x) ; 8 bytes per entry; etab_0 (offset 4) covers f2(x),x,x,f3(x)
+
+_aes_enc_tab:
+    db  u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5)
+    db  u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76)
+    db  u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0)
+    db  u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0)
+    db  u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc)
+    db  u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15)
+    db  u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a)
+    db  u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75)
+    db  u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0)
+    db  u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84)
+    db  u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b)
+    db  u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf)
+    db  u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85)
+    db  u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8)
+    db  u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5)
+    db  u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2)
+    db  u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17)
+    db  u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73)
+    db  u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88)
+    db  u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb)
+    db  u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c)
+    db  u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79)
+    db  u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9)
+    db  u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08)
+    db  u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6)
+    db  u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a)
+    db  u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e)
+    db  u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e)
+    db  u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94)
+    db  u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf)
+    db  u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68)
+    db  u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16)
+
+%endif
+
+%ifdef  DECRYPTION
+
+; %define DECRYPTION_TABLE
+
+%define dtab_0(x)   [_aes_dec_tab+  8*x]    ; operand at byte offset 0 of the 8-byte entry x
+%define dtab_1(x)   [_aes_dec_tab+3+8*x]    ; operand at byte offset 3 of entry x
+%define dtab_2(x)   [_aes_dec_tab+2+8*x]    ; operand at byte offset 2 of entry x
+%define dtab_3(x)   [_aes_dec_tab+1+8*x]    ; operand at byte offset 1 of entry x
+%define dtab_x(x)   byte [_aes_dec_tab+7+8*x] ; single byte at offset 7, used by the last-round macros
+
+%macro irn_fun 2                ; %1 = xor-style lookup macro, %2 = mov-style lookup macro
+; in: eax,ebx,ecx,edx = state, esi/edi preloaded with key words; out: columns in eax, ebp, esi, edi
+    rol eax,16                  ; swap halves so al/ah expose the upper two bytes of eax
+    %1      esi, cl, 0, ebp
+    %1      esi, bh, 1, ebp
+    %1      esi, al, 2, ebp
+    %1      edi, dl, 0, ebp
+    %1      edi, ch, 1, ebp
+    %1      edi, ah, 3, ebp
+    %2      ebp, bl, 0, ebp     ; first term of the column built in ebp
+    shr     eax,16              ; eax = its original low 16 bits, zero-extended
+    and     ebx,0xffff0000
+    or      ebx,eax             ; park eax's original low half in bx so eax can be reused
+    shr     ecx,16              ; cl/ch now expose the upper two bytes of ecx
+    %1      ebp, bh, 1, eax
+    %1      ebp, ch, 3, eax
+    %2      eax, cl, 2, ecx     ; first term of the column built in eax; ecx is index scratch from here on
+    %1      eax, bl, 0, ecx
+    %1      eax, dh, 1, ecx
+    shr     ebx,16              ; bl/bh now expose the upper two bytes of ebx
+    shr     edx,16              ; dl/dh now expose the upper two bytes of edx
+    %1      esi, dh, 3, ecx
+    %1      ebp, dl, 2, ecx
+    %1      eax, bh, 3, ecx
+    %1      edi, bl, 2, ecx
+
+%endmacro
+
+; Basic MOV and XOR Operations for normal rounds
+
+%macro  ni_xor  4               ; ni_xor <dst>,<byte reg>,<table n>,<scratch>
+    movzx   %4,%2               ; zero-extend the byte to form the table index
+    xor     %1,dtab_%3(%4)      ; xor the dtab_<n> operand for that index into dst
+%endmacro
+
+%macro  ni_mov  4               ; ni_mov <dst>,<byte reg>,<table n>,<scratch>
+    movzx   %4,%2               ; zero-extend the byte to form the table index
+    mov     %1,dtab_%3(%4)      ; load the dtab_<n> operand for that index, replacing dst
+%endmacro
+
+; Basic MOV and XOR Operations for last round
+
+%macro  li_xor  4               ; li_xor <dst>,<byte reg>,<lane>,<scratch>
+    movzx   %4,%2               ; table index from the source byte
+    movzx   %4,dtab_x(%4)       ; fetch a single table byte, zero-extended
+%if %3 != 0
+    shl     %4,8*%3             ; move it to byte lane %3
+%endif
+    xor     %1,%4               ; fold it into dst
+%endmacro
+
+%macro  li_mov  4               ; li_mov <dst>,<byte reg>,<lane>,<scratch>
+    movzx   %4,%2               ; table index from the source byte
+    movzx   %1,dtab_x(%4)       ; dst = single table byte, zero-extended
+%if %3 != 0
+    shl     %1,8*%3             ; move it to byte lane %3
+%endif
+%endmacro
+
+%ifdef REDUCE_CODE_SIZE
+; subroutine form: one normal decryption round; state in eax,ebx,ecx,edx, key pointer in ebp
+dec_round:
+	sub		sp, 2               ; presumably pads the 2-byte return address so slot 1 here is the caller's slot 0 - TODO confirm
+%ifdef AES_REV_DKS
+    add     ebp,16              ; reversed schedule: walk upwards
+%else
+    sub     ebp,16              ; unreversed schedule: walk downwards from the top
+%endif
+    save    1,ebp               ; irn_fun overwrites ebp, so keep the key pointer on the stack
+    mov     esi,[ebp+8]         ; preload key word 2
+    mov     edi,[ebp+12]        ; preload key word 3
+
+    irn_fun ni_xor, ni_mov
+
+    mov     ebx,ebp             ; column built in ebp moves to ebx (still unkeyed)
+    mov     ecx,esi             ; column accumulated on key word 2
+    mov     edx,edi             ; column accumulated on key word 3
+    restore ebp,1               ; recover the key pointer
+    xor     eax,[ebp]           ; apply key word 0
+    xor     ebx,[ebp+4]         ; apply key word 1
+   	add		sp, 2               ; undo the adjustment made on entry
+    ret
+
+%else
+
+%macro dec_round 0              ; inline form: same round, using stack slot 0 directly
+
+%ifdef AES_REV_DKS
+    add     ebp,16
+%else
+    sub     ebp,16
+%endif
+    save    0,ebp
+    mov     esi,[ebp+8]
+    mov     edi,[ebp+12]
+
+    irn_fun ni_xor, ni_mov
+
+    mov     ebx,ebp
+    mov     ecx,esi
+    mov     edx,edi
+    restore ebp,0
+    xor     eax,[ebp]
+    xor     ebx,[ebp+4]
+
+%endmacro
+
+%endif
+
+%macro dec_last_round 0
+    ; final round: same data flow as dec_round but built from the li_* single-byte lookup macros
+%ifdef AES_REV_DKS
+    add     ebp,16          ; advance to the final round key (reversed schedule: ascending)
+%else
+    sub     ebp,16          ; advance to the final round key (schedule read from the top down)
+%endif
+    save    0,ebp           ; save/restore are defined outside this chunk
+    mov     esi,[ebp+8]     ; preload round key words 2 and 3
+    mov     edi,[ebp+12]
+
+    irn_fun li_xor, li_mov
+
+    mov     ebx,ebp         ; results stay in eax, ebx, esi, edi (no copy back to ecx/edx)
+    restore ebp,0
+    xor     eax,[ebp]       ; round key words 0 and 1 are added afterwards
+    xor     ebx,[ebp+4]
+
+%endmacro
+
+    section _TEXT
+
+; AES Decryption Subroutine
+
+    do_name _aes_decrypt,12
+    ; decrypt one 16-byte block; the in_blk, out_blk and ctx arguments are read from the stack as 16-bit words
+	mov		ax, sp          ; esp = zero-extended sp (upper 16 bits cleared)
+	movzx	esp, ax         ; before any [esp+n] operand is used
+
+    sub     esp,stk_spc     ; local frame; stk_spc is defined outside this chunk
+    mov     [esp+16],ebp    ; preserve the registers that are reloaded at .5
+    mov     [esp+12],ebx
+    mov     [esp+ 8],esi
+    mov     [esp+ 4],edi
+
+; input four columns and xor in first round key
+
+    movzx   esi,word [esp+in_blk+stk_spc] ; input pointer
+    mov     eax,[esi   ]
+    mov     ebx,[esi+ 4]
+    mov     ecx,[esi+ 8]
+    mov     edx,[esi+12]
+    lea     esi,[esi+16]
+
+    movzx   ebp, word [esp+ctx+stk_spc]    ; key pointer
+    movzx   edi,byte[ebp+4*KS_LENGTH]      ; byte kept 4*KS_LENGTH into the context: offset of the top round key
+%ifndef  AES_REV_DKS        ; if decryption key schedule is not reversed
+    lea     ebp,[ebp+edi] ; we have to access it from the top down
+%endif
+    xor     eax,[ebp   ]  ; key schedule
+    xor     ebx,[ebp+ 4]
+    xor     ecx,[ebp+ 8]
+    xor     edx,[ebp+12]
+
+; determine the number of rounds
+
+%ifndef AES_256
+    cmp     edi,10*16
+    je      .3
+    cmp     edi,12*16
+    je      .2
+    cmp     edi,14*16
+    je      .1
+    mov     eax,-1          ; unrecognised length: return -1 without writing the output block
+    jmp     .5
+%endif
+
+.1: mf_call dec_round       ; entry point for 14 rounds
+    mf_call dec_round
+.2: mf_call dec_round       ; entry point for 12 rounds
+    mf_call dec_round
+.3: mf_call dec_round       ; entry point for 10 rounds (9 normal rounds + the last round)
+    mf_call dec_round
+    mf_call dec_round
+    mf_call dec_round
+    mf_call dec_round
+    mf_call dec_round
+    mf_call dec_round
+    mf_call dec_round
+    mf_call dec_round
+    dec_last_round
+
+; move final values to the output array.
+
+    movzx   ebp,word [esp+out_blk+stk_spc]
+    mov     [ebp],eax
+    mov     [ebp+4],ebx
+    mov     [ebp+8],esi     ; dec_last_round leaves words 2 and 3 in esi/edi
+    mov     [ebp+12],edi
+    xor     eax,eax         ; return 0
+
+.5: mov     ebp,[esp+16]
+    mov     ebx,[esp+12]
+    mov     esi,[esp+ 8]
+    mov     edi,[esp+ 4]
+    add     esp,stk_spc
+    do_exit 12
+
+%endif
+
+%ifdef REDUCE_CODE_SIZE
+; inv_mix_col as a subroutine: transforms the word in edx, result in eax (edx and ecx are clobbered)
+inv_mix_col:
+    movzx   ecx,dl          ; input  eax, edx
+    movzx   ecx,etab_b(ecx) ; output eax
+    mov     eax,dtab_0(ecx) ; used   ecx
+    movzx   ecx,dh
+    shr     edx,16          ; expose the upper two input bytes as dl/dh
+    movzx   ecx,etab_b(ecx) ; NOTE(review): etab_b is defined outside this chunk - presumably the forward
+    xor     eax,dtab_1(ecx) ; S-box byte, cancelling the inverse S-box folded into the dtab entries; confirm
+    movzx   ecx,dl
+    movzx   ecx,etab_b(ecx)
+    xor     eax,dtab_2(ecx)
+    movzx   ecx,dh
+    movzx   ecx,etab_b(ecx)
+    xor     eax,dtab_3(ecx)
+    ret
+
+%else
+; inv_mix_col expanded inline: transforms the word in edx, result in eax (edx and ecx are clobbered)
+%macro  inv_mix_col 0   
+
+    movzx   ecx,dl          ; input  eax, edx
+    movzx   ecx,etab_b(ecx) ; output eax
+    mov     eax,dtab_0(ecx) ; used   ecx
+    movzx   ecx,dh
+    shr     edx,16          ; expose the upper two input bytes as dl/dh
+    movzx   ecx,etab_b(ecx) ; NOTE(review): etab_b is defined outside this chunk - presumably the forward
+    xor     eax,dtab_1(ecx) ; S-box byte, cancelling the inverse S-box folded into the dtab entries; confirm
+    movzx   ecx,dl
+    movzx   ecx,etab_b(ecx)
+    xor     eax,dtab_2(ecx)
+    movzx   ecx,dh
+    movzx   ecx,etab_b(ecx)
+    xor     eax,dtab_3(ecx)
+
+%endmacro
+
+%endif
+
+%ifdef DECRYPTION_KEY_SCHEDULE
+
+%ifdef AES_128
+
+%ifndef DECRYPTION_TABLE
+; %define DECRYPTION_TABLE
+%endif
+; NOTE(review): arguments are read as 32-bit stack slots here, whereas _aes_decrypt_key256 reads 16-bit words - confirm where this path is built
+    do_name _aes_decrypt_key128,8
+    ; expand the encryption key into the context, then convert the round keys in place for decryption
+    push    ebp
+    push    ebx
+    push    esi
+    push    edi
+    mov     eax,[esp+24]    ; context
+    mov     edx,[esp+20]    ; key
+    push    eax
+    push    edx
+    do_call _aes_encrypt_key128,8   ; generate expanded encryption key
+    mov     eax,10*16       ; byte offset of the last round key
+    mov     esi,[esp+24]    ; pointer to first round key
+    lea     edi,[esi+eax]   ; pointer to last round key
+    add     esi,32
+                            ; the inverse mix column transformation
+    mov     edx,[esi-16]    ; needs to be applied to all round keys
+    mf_call inv_mix_col     ; except first and last. Hence start by
+    mov     [esi-16],eax    ; transforming the four sub-keys in the
+    mov     edx,[esi-12]    ; second round key
+    mf_call inv_mix_col
+    mov     [esi-12],eax    ; transformations for subsequent rounds
+    mov     edx,[esi-8]     ; can then be made more efficient by
+    mf_call inv_mix_col     ; noting that for three of the four sub-keys
+    mov     [esi-8],eax     ; in the encryption round key ek[r]:
+    mov     edx,[esi-4]     ;
+    mf_call inv_mix_col     ;   ek[r][n] = ek[r][n-1] ^ ek[r-1][n]
+    mov     [esi-4],eax     ;
+                            ; where n is 1..3. Hence the corresponding
+.0: mov     edx,[esi]       ; subkeys in the decryption round key dk[r]
+    mf_call inv_mix_col     ; also obey since inv_mix_col is linear in
+    mov     [esi],eax       ; GF(256):
+    xor     eax,[esi-12]    ;
+    mov     [esi+4],eax     ;   dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
+    xor     eax,[esi-8]     ;
+    mov     [esi+8],eax     ; So we only need one inverse mix column
+    xor     eax,[esi-4]     ; operation (n = 0) for each four word cycle
+    mov     [esi+12],eax    ; in the expanded key.
+    add     esi,16          ; next round key
+    cmp     edi,esi         ; stop on reaching the last round key,
+    jg      .0              ; which is left unmodified
+    jmp     dec_end         ; shared epilogue (expects edi = last round key)
+
+%endif
+
+%ifdef AES_192
+
+%ifndef DECRYPTION_TABLE
+; %define DECRYPTION_TABLE
+%endif
+; NOTE(review): arguments are read as 32-bit stack slots here, whereas _aes_decrypt_key256 reads 16-bit words - confirm where this path is built
+    do_name _aes_decrypt_key192,8
+    ; expand the encryption key into the context, then convert the round keys in place for decryption
+    push    ebp
+    push    ebx
+    push    esi
+    push    edi
+    mov     eax,[esp+24]    ; context
+    mov     edx,[esp+20]    ; key
+    push    eax
+    push    edx
+    do_call _aes_encrypt_key192,8   ; generate expanded encryption key
+    mov     eax,12*16       ; byte offset of the last round key
+    mov     esi,[esp+24]    ; first round key
+    lea     edi,[esi+eax]   ; last round key
+    add     esi,48          ; the first 6 words are the key, of
+                            ; which the top 2 words are part of
+    mov     edx,[esi-32]    ; the second round key and hence
+    mf_call inv_mix_col     ; need to be modified. After this we
+    mov     [esi-32],eax    ; need to do a further six values prior
+    mov     edx,[esi-28]    ; to using a more efficient technique
+    mf_call inv_mix_col     ; based on:
+    mov     [esi-28],eax    ;
+                            ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
+    mov     edx,[esi-24]    ;
+    mf_call inv_mix_col     ; for n = 1 .. 5 where the key expansion
+    mov     [esi-24],eax    ; cycle is now 6 words long
+    mov     edx,[esi-20]
+    mf_call inv_mix_col
+    mov     [esi-20],eax
+    mov     edx,[esi-16]
+    mf_call inv_mix_col
+    mov     [esi-16],eax
+    mov     edx,[esi-12]
+    mf_call inv_mix_col
+    mov     [esi-12],eax
+    mov     edx,[esi-8]
+    mf_call inv_mix_col
+    mov     [esi-8],eax
+    mov     edx,[esi-4]
+    mf_call inv_mix_col
+    mov     [esi-4],eax
+
+.0: mov     edx,[esi]       ; the expanded key is 13 * 4 = 52 32-bit words
+    mf_call inv_mix_col     ; of which 11 * 4 = 44 have to be modified
+    mov     [esi],eax       ; using inv_mix_col.  We have already done 8
+    xor     eax,[esi-20]    ; of these so 36 are left - hence we need
+    mov     [esi+4],eax     ; exactly 6 loops of six here
+    xor     eax,[esi-16]
+    mov     [esi+8],eax
+    xor     eax,[esi-12]
+    mov     [esi+12],eax
+    xor     eax,[esi-8]
+    mov     [esi+16],eax
+    xor     eax,[esi-4]
+    mov     [esi+20],eax
+    add     esi,24          ; next six-word cycle
+    cmp     edi,esi         ; stop on reaching the last round key,
+    jg      .0              ; which is left unmodified
+    jmp     dec_end         ; shared epilogue (expects edi = last round key)
+
+%endif
+
+%ifdef AES_256
+
+%ifndef DECRYPTION_TABLE
+; %define DECRYPTION_TABLE
+%endif
+
+    do_name _aes_decrypt_key256,8
+    ; expand the encryption key into the context, then convert the round keys in place for decryption
+    mov		ax, sp          ; esp = zero-extended sp (upper 16 bits cleared)
+	movzx	esp, ax         ; before any [esp+n] operand is used
+    push    ebp
+    push    ebx
+    push    esi
+    push    edi
+    ; the two arguments are 16-bit words located above the 16 bytes of saved registers
+    movzx   eax, word [esp+20] ; ks
+    movzx   edx, word [esp+18] ; key
+    push    ax
+    push    dx
+    do_call _aes_encrypt_key256,4   ; generate expanded encryption key
+    mov     eax,14*16       ; byte offset of the last round key
+    movzx   esi, word [esp+20] ; ks
+    lea     edi,[esi+eax]   ; pointer to the last round key
+    add     esi,64
+
+    mov     edx,[esi-48]    ; the primary key is 8 words, of which
+    mf_call inv_mix_col     ; the top four require modification
+    mov     [esi-48],eax
+    mov     edx,[esi-44]
+    mf_call inv_mix_col
+    mov     [esi-44],eax
+    mov     edx,[esi-40]
+    mf_call inv_mix_col
+    mov     [esi-40],eax
+    mov     edx,[esi-36]
+    mf_call inv_mix_col
+    mov     [esi-36],eax
+
+    mov     edx,[esi-32]    ; the encryption key expansion cycle is
+    mf_call inv_mix_col     ; now eight words long so we need to
+    mov     [esi-32],eax    ; start by doing one complete block
+    mov     edx,[esi-28]
+    mf_call inv_mix_col
+    mov     [esi-28],eax
+    mov     edx,[esi-24]
+    mf_call inv_mix_col
+    mov     [esi-24],eax
+    mov     edx,[esi-20]
+    mf_call inv_mix_col
+    mov     [esi-20],eax
+    mov     edx,[esi-16]
+    mf_call inv_mix_col
+    mov     [esi-16],eax
+    mov     edx,[esi-12]
+    mf_call inv_mix_col
+    mov     [esi-12],eax
+    mov     edx,[esi-8]
+    mf_call inv_mix_col
+    mov     [esi-8],eax
+    mov     edx,[esi-4]
+    mf_call inv_mix_col
+    mov     [esi-4],eax
+
+.0: mov     edx,[esi]       ; we can now speed up the remaining
+    mf_call inv_mix_col     ; rounds by using the technique
+    mov     [esi],eax       ; outlined earlier.  But note that
+    xor     eax,[esi-28]    ; there is one extra inverse mix
+    mov     [esi+4],eax     ; column operation as the 256 bit
+    xor     eax,[esi-24]    ; key has an extra non-linear step
+    mov     [esi+8],eax     ; for the midway element.
+    xor     eax,[esi-20]
+    mov     [esi+12],eax    ; the expanded key is 15 * 4 = 60
+    mov     edx,[esi+16]    ; 32-bit words of which 52 need to
+    mf_call inv_mix_col     ; be modified.  We have already done
+    mov     [esi+16],eax    ; 12 so 40 are left - which means
+    xor     eax,[esi-12]    ; that we need exactly 5 loops of 8
+    mov     [esi+20],eax
+    xor     eax,[esi-8]
+    mov     [esi+24],eax
+    xor     eax,[esi-4]
+    mov     [esi+28],eax
+    add     esi,32          ; next eight-word cycle
+    cmp     edi,esi         ; stop on reaching the last round key,
+    jg      .0              ; which is left unmodified
+    ; no jump needed: execution continues into dec_end
+%endif
+
+dec_end:
+; common exit of the decryption key routines; on entry edi points at the last round key
+%ifdef AES_REV_DKS
+; NOTE(review): ks is re-read as a 16-bit word at [esp+20], the frame layout used by _aes_decrypt_key256 - confirm for the 128/192 paths
+    movzx   esi,word [esp+20]	; this reverses the order of the
+.1: mov     eax,[esi]			; round keys if required
+    mov     ebx,[esi+4]
+    mov     ebp,[edi]
+    mov     edx,[edi+4]
+    mov     [esi],ebp
+    mov     [esi+4],edx
+    mov     [edi],eax
+    mov     [edi+4],ebx
+
+    mov     eax,[esi+8]
+    mov     ebx,[esi+12]
+    mov     ebp,[edi+8]
+    mov     edx,[edi+12]
+    mov     [esi+8],ebp
+    mov     [esi+12],edx
+    mov     [edi+8],eax
+    mov     [edi+12],ebx
+
+    add     esi,16          ; move both ends one round key towards the middle
+    sub     edi,16
+    cmp     edi,esi         ; keep swapping while the upper pointer
+    jg      .1              ; is still above the lower one
+
+%endif
+
+    pop     edi             ; restore the registers pushed by the key routine
+    pop     esi
+    pop     ebx
+    pop     ebp
+    xor     eax,eax         ; return 0
+    do_exit  8
+
+%ifdef AES_VAR
+; variable-length front end: dispatches on the length argument to the fixed-size key routines
+    do_name _aes_decrypt_key,12
+
+    mov     ecx,[esp+4]     ; first argument, forwarded unchanged
+    mov     eax,[esp+8]     ; second argument: key length
+    mov     edx,[esp+12]    ; third argument, forwarded unchanged
+    push    edx
+    push    ecx
+
+    cmp     eax,16          ; the length is accepted either as a byte count
+    je      .1
+    cmp     eax,128         ; or as a bit count
+    je      .1
+
+    cmp     eax,24
+    je      .2
+    cmp     eax,192
+    je      .2
+
+    cmp     eax,32
+    je      .3
+    cmp     eax,256
+    je      .3
+    mov     eax,-1          ; any other length: return -1
+    add     esp,8           ; drop the two arguments pushed above
+    do_exit 12
+
+.1: do_call _aes_decrypt_key128,8
+    do_exit 12
+.2: do_call _aes_decrypt_key192,8
+    do_exit 12
+.3: do_call _aes_decrypt_key256,8
+    do_exit 12
+
+%endif
+
+%endif
+
+%ifdef DECRYPTION_TABLE
+
+; Decryption table - 256 entries of 8 bytes each (see v8 below); the last byte of each entry is the inverse S-box value
+
+    section _DATA
+
+%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
+
+_aes_dec_tab:
+    db  v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38)
+    db  v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb)
+    db  v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87)
+    db  v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb)
+    db  v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d)
+    db  v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e)
+    db  v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2)
+    db  v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25)
+    db  v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16)
+    db  v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92)
+    db  v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda)
+    db  v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84)
+    db  v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a)
+    db  v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06)
+    db  v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02)
+    db  v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b)
+    db  v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea)
+    db  v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73)
+    db  v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85)
+    db  v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e)
+    db  v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89)
+    db  v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b)
+    db  v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20)
+    db  v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4)
+    db  v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31)
+    db  v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f)
+    db  v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d)
+    db  v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef)
+    db  v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0)
+    db  v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61)
+    db  v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26)
+    db  v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d)
+
+%endif
diff --git a/src/Crypto/Aes_hw_cpu.asm b/src/Crypto/Aes_hw_cpu.asm
index 64c3bad8..53852665 100644
--- a/src/Crypto/Aes_hw_cpu.asm
+++ b/src/Crypto/Aes_hw_cpu.asm
@@ -1,330 +1,330 @@
-;
-; Copyright (c) 2010 TrueCrypt Developers Association. All rights reserved.
-;
-; Governed by the TrueCrypt License 3.0 the full text of which is contained in
-; the file License.txt included in TrueCrypt binary and source code distribution
-; packages.
-;
-
-
-%ifidn __BITS__, 16
-	%define R e
-%elifidn __BITS__, 32
-	%define R e
-%elifidn __BITS__, 64
-	%define R r
-%endif
-
-
-%macro export_function 1-2 0
-
-	%ifdef MS_STDCALL
-		global %1@%2
-		export _%1@%2
-	%1@%2:
-	%elifidn __BITS__, 16
-		global _%1
-	_%1:
-	%else
-		global %1
-	%1:
-	%endif
-
-%endmacro
-
-
-%macro aes_function_entry 1
-
-	; void (const byte *ks, byte *data);
-
-	export_function %1, 8
-
-	%ifidn __BITS__, 32
-		mov ecx, [esp + 4 + 4 * 0]
-		mov edx, [esp + 4 + 4 * 1]
-	%elifidn __BITS__, 64
-		%ifnidn __OUTPUT_FORMAT__, win64
-			mov rcx, rdi
-			mov rdx, rsi
-		%endif
-	%endif
-
-	; ecx/rcx = ks
-	; edx/rdx = data
-
-%endmacro
-
-
-%macro aes_function_exit 0
-
-	; void (const byte *, byte *);
-
-	%ifdef MS_STDCALL
-		ret 8
-	%else
-		ret
-	%endif
-
-%endmacro
-
-
-%macro push_xmm 2
-	sub rsp, 16 * (%2 - %1 + 1)
-
-	%assign stackoffset 0
-	%assign regnumber %1
-
-	%rep (%2 - %1 + 1)
-		movdqu [rsp + 16 * stackoffset], xmm%[regnumber]
-
-		%assign stackoffset stackoffset+1
-		%assign regnumber regnumber+1
-	%endrep
-%endmacro
-
-
-%macro pop_xmm 2
-	%assign stackoffset 0
-	%assign regnumber %1
-
-	%rep (%2 - %1 + 1)
-		movdqu xmm%[regnumber], [rsp + 16 * stackoffset]
-
-		%assign stackoffset stackoffset+1
-		%assign regnumber regnumber+1
-	%endrep
-
-	add rsp, 16 * (%2 - %1 + 1)
-%endmacro
-
-
-%macro aes_hw_cpu 2
-	%define OPERATION %1
-	%define BLOCK_COUNT %2
-
-	; Load data blocks
-	%assign block 1
-	%rep BLOCK_COUNT
-		movdqu xmm%[block], [%[R]dx + 16 * (block - 1)]
-		%assign block block+1
-	%endrep
-
-	; Encrypt/decrypt data blocks
-	%assign round 0
-	%rep 15
-		movdqu xmm0, [%[R]cx + 16 * round]
-
-		%assign block 1
-		%rep BLOCK_COUNT
-
-			%if round = 0
-				pxor xmm%[block], xmm0
-			%else
-				%if round < 14
-					aes%[OPERATION] xmm%[block], xmm0
-				%else
-					aes%[OPERATION]last xmm%[block], xmm0
-				%endif
-			%endif
-
-			%assign block block+1
-		%endrep
-
-		%assign round round+1
-	%endrep
-
-	; Store data blocks
-	%assign block 1
-	%rep BLOCK_COUNT
-		movdqu [%[R]dx + 16 * (block - 1)], xmm%[block]
-		%assign block block+1
-	%endrep
-
-	%undef OPERATION
-	%undef BLOCK_COUNT
-%endmacro
-
-
-%macro aes_hw_cpu_32_blocks 1
-	%define OPERATION_32_BLOCKS %1
-
-	%ifidn __BITS__, 64
-		%define MAX_REG_BLOCK_COUNT 15
-	%else
-		%define MAX_REG_BLOCK_COUNT 7
-	%endif
-
-	%ifidn __OUTPUT_FORMAT__, win64
-		%if MAX_REG_BLOCK_COUNT > 5
-			push_xmm 6, MAX_REG_BLOCK_COUNT
-		%endif
-	%endif
-
-		mov eax, 32 / MAX_REG_BLOCK_COUNT
-	.1:
-		aes_hw_cpu %[OPERATION_32_BLOCKS], MAX_REG_BLOCK_COUNT
-
-		add %[R]dx, 16 * MAX_REG_BLOCK_COUNT
-		dec eax
-		jnz .1
-
-	%if (32 % MAX_REG_BLOCK_COUNT) != 0
-		aes_hw_cpu %[OPERATION_32_BLOCKS], (32 % MAX_REG_BLOCK_COUNT)
-	%endif
-
-	%ifidn __OUTPUT_FORMAT__, win64
-		%if MAX_REG_BLOCK_COUNT > 5
-			pop_xmm 6, MAX_REG_BLOCK_COUNT
-		%endif
-	%endif
-
-	%undef OPERATION_32_BLOCKS
-	%undef MAX_REG_BLOCK_COUNT
-%endmacro
-
-
-%ifidn __BITS__, 16
-
-	USE16
-	SEGMENT _TEXT PUBLIC CLASS=CODE USE16
-	SEGMENT _DATA PUBLIC CLASS=DATA USE16
-	GROUP DGROUP _TEXT _DATA
-	SECTION _TEXT
-
-%else
-
-	SECTION .text
-
-%endif
-
-
-; void aes_hw_cpu_enable_sse ();
-
-	export_function aes_hw_cpu_enable_sse
-		mov %[R]ax, cr4
-		or ax, 1 << 9
-		mov cr4, %[R]ax
-	ret
-
-
-%ifidn __BITS__, 16
-
-
-; byte is_aes_hw_cpu_supported ();
-
-	export_function is_aes_hw_cpu_supported
-		mov eax, 1
-		cpuid
-		mov eax, ecx
-		shr eax, 25
-		and al, 1
-	ret
-
-
-; void aes_hw_cpu_decrypt (const byte *ks, byte *data);
-
-	export_function aes_hw_cpu_decrypt
-		mov ax, -16
-		jmp aes_hw_cpu_encrypt_decrypt
-
-; void aes_hw_cpu_encrypt (const byte *ks, byte *data);
-
-	export_function aes_hw_cpu_encrypt
-		mov ax, 16
-
-	aes_hw_cpu_encrypt_decrypt:
-		push bp
-		mov bp, sp
-		push di
-		push si
-
-		mov si, [bp + 4]			; ks
-		mov di, [bp + 4 + 2]		; data
-
-		movdqu xmm0, [si]
-		movdqu xmm1, [di]
-
-		pxor xmm1, xmm0
-
-		mov cx, 13
-
-	.round1_13:
-		add si, ax
-		movdqu xmm0, [si]
-
-		cmp ax, 0
-		jl .decrypt
-		
-		aesenc xmm1, xmm0
-		jmp .2
-	.decrypt:
-		aesdec xmm1, xmm0
-	.2:
-		loop .round1_13
-
-		add si, ax
-		movdqu xmm0, [si]
-		
-		cmp ax, 0
-		jl .decrypt_last
-
-		aesenclast xmm1, xmm0
-		jmp .3
-	.decrypt_last:
-		aesdeclast xmm1, xmm0
-	.3:
-		movdqu [di], xmm1
-
-		pop si
-		pop di
-		pop bp
-	ret
-
-
-%else	; __BITS__ != 16
-
-
-; byte is_aes_hw_cpu_supported ();
-
-	export_function is_aes_hw_cpu_supported
-		push %[R]bx
-
-		mov eax, 1
-		cpuid
-		mov eax, ecx
-		shr eax, 25
-		and eax, 1
-
-		pop %[R]bx
-	ret
-
-
-; void aes_hw_cpu_decrypt (const byte *ks, byte *data);
-
-	aes_function_entry aes_hw_cpu_decrypt
-		aes_hw_cpu dec, 1
-	aes_function_exit
-
-
-; void aes_hw_cpu_decrypt_32_blocks (const byte *ks, byte *data);
-
-	aes_function_entry aes_hw_cpu_decrypt_32_blocks
-		aes_hw_cpu_32_blocks dec
-	aes_function_exit
-
-
-; void aes_hw_cpu_encrypt (const byte *ks, byte *data);
-
-	aes_function_entry aes_hw_cpu_encrypt
-		aes_hw_cpu enc, 1
-	aes_function_exit
-
-
-; void aes_hw_cpu_encrypt_32_blocks (const byte *ks, byte *data);
-
-	aes_function_entry aes_hw_cpu_encrypt_32_blocks
-		aes_hw_cpu_32_blocks enc
-	aes_function_exit
-
-
-%endif	; __BITS__ != 16
+;
+; Copyright (c) 2010 TrueCrypt Developers Association. All rights reserved.
+;
+; Governed by the TrueCrypt License 3.0 the full text of which is contained in
+; the file License.txt included in TrueCrypt binary and source code distribution
+; packages.
+;
+
+
+%ifidn __BITS__, 16
+	%define R e		; R is the register-name prefix: R+ax gives eax here and in 32-bit builds
+%elifidn __BITS__, 32
+	%define R e
+%elifidn __BITS__, 64
+	%define R r		; 64-bit builds: R+ax gives rax
+%endif
+
+
+%macro export_function 1-2 0
+	; arg 1 = function name; arg 2 = number used in the stdcall "@n" suffix (defaults to 0)
+	%ifdef MS_STDCALL
+		global %1@%2
+		export _%1@%2
+	%1@%2:
+	%elifidn __BITS__, 16
+		global _%1		; 16-bit build: the label gets a leading underscore
+	_%1:
+	%else
+		global %1		; all other builds: the plain name is used
+	%1:
+	%endif
+
+%endmacro
+
+
+%macro aes_function_entry 1
+	; Emits the exported label for the function named by arg 1 and loads its two pointer arguments.
+	; void (const byte *ks, byte *data);
+
+	export_function %1, 8
+
+	%ifidn __BITS__, 32
+		mov ecx, [esp + 4 + 4 * 0]	; first stack argument (ks)
+		mov edx, [esp + 4 + 4 * 1]	; second stack argument (data)
+	%elifidn __BITS__, 64
+		%ifnidn __OUTPUT_FORMAT__, win64
+			mov rcx, rdi		; non-win64 targets: copy the arguments out of rdi/rsi
+			mov rdx, rsi
+		%endif
+	%endif
+	; (win64: no moves are emitted; rcx/rdx are used as they arrive)
+	; ecx/rcx = ks
+	; edx/rdx = data
+
+%endmacro
+
+
+%macro aes_function_exit 0
+	; Return sequence matching aes_function_entry.
+	; void (const byte *, byte *);
+
+	%ifdef MS_STDCALL
+		ret 8		; stdcall: also remove the 8 bytes of arguments from the stack
+	%else
+		ret
+	%endif
+
+%endmacro
+
+
+%macro push_xmm 2			; args: first and last xmm register number (inclusive)
+	sub rsp, 16 * (%2 - %1 + 1)	; reserve 16 bytes per register
+
+	%assign stackoffset 0
+	%assign regnumber %1
+
+	%rep (%2 - %1 + 1)
+		movdqu [rsp + 16 * stackoffset], xmm%[regnumber]	; lowest-numbered register at the lowest address
+
+		%assign stackoffset stackoffset+1
+		%assign regnumber regnumber+1
+	%endrep
+%endmacro
+
+
+%macro pop_xmm 2			; args: first and last xmm register number (inclusive), as given to push_xmm
+	%assign stackoffset 0
+	%assign regnumber %1
+
+	%rep (%2 - %1 + 1)
+		movdqu xmm%[regnumber], [rsp + 16 * stackoffset]	; reload in the order push_xmm stored them
+
+		%assign stackoffset stackoffset+1
+		%assign regnumber regnumber+1
+	%endrep
+
+	add rsp, 16 * (%2 - %1 + 1)	; release the save area
+%endmacro
+
+
+%macro aes_hw_cpu 2
+	%define OPERATION %1
+	%define BLOCK_COUNT %2
+	; arg 1 = dec or enc (pasted into the instruction mnemonic), arg 2 = number of 16-byte blocks
+	; Load data blocks: block n of the buffer at dx goes into xmm<n> (xmm0 is kept for the round key)
+	%assign block 1
+	%rep BLOCK_COUNT
+		movdqu xmm%[block], [%[R]dx + 16 * (block - 1)]
+		%assign block block+1
+	%endrep
+
+	; Encrypt/decrypt data blocks: 15 round keys are read from cx in ascending order - key 0 is XORed in,
+	%assign round 0
+	%rep 15
+		movdqu xmm0, [%[R]cx + 16 * round]
+		; keys 1-13 use the full-round instruction and key 14 its "last" form
+		%assign block 1
+		%rep BLOCK_COUNT
+
+			%if round = 0
+				pxor xmm%[block], xmm0
+			%else
+				%if round < 14
+					aes%[OPERATION] xmm%[block], xmm0
+				%else
+					aes%[OPERATION]last xmm%[block], xmm0
+				%endif
+			%endif
+
+			%assign block block+1
+		%endrep
+
+		%assign round round+1
+	%endrep
+
+	; Store data blocks back over the input (in place)
+	%assign block 1
+	%rep BLOCK_COUNT
+		movdqu [%[R]dx + 16 * (block - 1)], xmm%[block]
+		%assign block block+1
+	%endrep
+
+	%undef OPERATION
+	%undef BLOCK_COUNT
+%endmacro
+
+
+%macro aes_hw_cpu_32_blocks 1
+	%define OPERATION_32_BLOCKS %1
+	; arg 1 = dec or enc; processes 32 blocks in batches of MAX_REG_BLOCK_COUNT
+	%ifidn __BITS__, 64
+		%define MAX_REG_BLOCK_COUNT 15	; blocks occupy xmm1..xmm15
+	%else
+		%define MAX_REG_BLOCK_COUNT 7	; blocks occupy xmm1..xmm7
+	%endif
+	; win64 only: xmm6 and above are saved here and reloaded after the last batch
+	%ifidn __OUTPUT_FORMAT__, win64
+		%if MAX_REG_BLOCK_COUNT > 5
+			push_xmm 6, MAX_REG_BLOCK_COUNT
+		%endif
+	%endif
+
+		mov eax, 32 / MAX_REG_BLOCK_COUNT	; number of full batches
+	.1:
+		aes_hw_cpu %[OPERATION_32_BLOCKS], MAX_REG_BLOCK_COUNT
+
+		add %[R]dx, 16 * MAX_REG_BLOCK_COUNT	; advance the data pointer past the batch
+		dec eax
+		jnz .1
+	; leftover blocks (32 mod MAX_REG_BLOCK_COUNT); dx already points past the full batches
+	%if (32 % MAX_REG_BLOCK_COUNT) != 0
+		aes_hw_cpu %[OPERATION_32_BLOCKS], (32 % MAX_REG_BLOCK_COUNT)
+	%endif
+
+	%ifidn __OUTPUT_FORMAT__, win64
+		%if MAX_REG_BLOCK_COUNT > 5
+			pop_xmm 6, MAX_REG_BLOCK_COUNT
+		%endif
+	%endif
+
+	%undef OPERATION_32_BLOCKS
+	%undef MAX_REG_BLOCK_COUNT
+%endmacro
+
+
+%ifidn __BITS__, 16
+	; 16-bit build: declare code and data segments, group both under DGROUP, assemble into _TEXT
+	USE16
+	SEGMENT _TEXT PUBLIC CLASS=CODE USE16
+	SEGMENT _DATA PUBLIC CLASS=DATA USE16
+	GROUP DGROUP _TEXT _DATA
+	SECTION _TEXT
+
+%else
+	; 32/64-bit builds: assemble into the .text section
+	SECTION .text
+
+%endif
+
+
+; void aes_hw_cpu_enable_sse ();
+; Sets bit 9 of CR4 (named OSFXSR in the Intel manuals); all other CR4 bits are written back unchanged.
+	export_function aes_hw_cpu_enable_sse
+		mov %[R]ax, cr4
+		or ax, 1 << 9
+		mov cr4, %[R]ax
+	ret
+
+
+%ifidn __BITS__, 16
+
+
+; byte is_aes_hw_cpu_supported ();
+; 16-bit build: returns bit 25 of ECX from CPUID leaf 1 (the AES instruction flag per the Intel manuals) in al.
+	export_function is_aes_hw_cpu_supported
+		mov eax, 1
+		cpuid				; NOTE(review): also overwrites ebx/ecx/edx; ebx is not saved here, unlike the 32/64-bit variant
+		mov eax, ecx
+		shr eax, 25			; move bit 25 down to bit 0
+		and al, 1			; al = 0 or 1
+	ret
+
+
+; void aes_hw_cpu_decrypt (const byte *ks, byte *data);
+; 16-bit build: one block, in place; ks is stepped down by 16 per round, so it must point at the round key used first
+	export_function aes_hw_cpu_decrypt
+		mov ax, -16			; ax = signed step between round keys; negative selects the aesdec path
+		jmp aes_hw_cpu_encrypt_decrypt
+
+; void aes_hw_cpu_encrypt (const byte *ks, byte *data);
+; 16-bit build: one block, in place; ks is stepped up by 16 per round
+	export_function aes_hw_cpu_encrypt
+		mov ax, 16			; positive step selects the aesenc path
+
+	aes_hw_cpu_encrypt_decrypt:
+		push bp
+		mov bp, sp
+		push di
+		push si
+
+		mov si, [bp + 4]			; ks
+		mov di, [bp + 4 + 2]		; data
+
+		movdqu xmm0, [si]			; first round key
+		movdqu xmm1, [di]			; data block
+
+		pxor xmm1, xmm0
+
+		mov cx, 13					; 13 full rounds, followed by one "last" round
+
+	.round1_13:
+		add si, ax					; step to the next round key
+		movdqu xmm0, [si]
+
+		cmp ax, 0					; the sign of the step tells the direction
+		jl .decrypt
+		
+		aesenc xmm1, xmm0
+		jmp .2
+	.decrypt:
+		aesdec xmm1, xmm0
+	.2:
+		loop .round1_13				; decrements cx and repeats while it is non-zero
+
+		add si, ax					; final round key
+		movdqu xmm0, [si]
+		
+		cmp ax, 0
+		jl .decrypt_last
+
+		aesenclast xmm1, xmm0
+		jmp .3
+	.decrypt_last:
+		aesdeclast xmm1, xmm0
+	.3:
+		movdqu [di], xmm1			; write the result back over the input block
+
+		pop si
+		pop di
+		pop bp
+	ret
+
+
+%else	; __BITS__ != 16
+
+
+; byte is_aes_hw_cpu_supported ();
+; 32/64-bit build: returns bit 25 of ECX from CPUID leaf 1 (the AES instruction flag per the Intel manuals) in eax.
+	export_function is_aes_hw_cpu_supported
+		push %[R]bx			; cpuid overwrites ebx, so it is saved around the call
+
+		mov eax, 1
+		cpuid
+		mov eax, ecx
+		shr eax, 25			; move bit 25 down to bit 0
+		and eax, 1			; eax = 0 or 1
+
+		pop %[R]bx
+	ret
+
+
+; void aes_hw_cpu_decrypt (const byte *ks, byte *data);
+; 32/64-bit build: decrypts one 16-byte block at data in place, reading 15 round keys from ks in ascending order
+	aes_function_entry aes_hw_cpu_decrypt
+		aes_hw_cpu dec, 1
+	aes_function_exit
+
+
+; void aes_hw_cpu_decrypt_32_blocks (const byte *ks, byte *data);
+; 32/64-bit build: decrypts 32 consecutive 16-byte blocks at data in place with the same 15 round keys
+	aes_function_entry aes_hw_cpu_decrypt_32_blocks
+		aes_hw_cpu_32_blocks dec
+	aes_function_exit
+
+
+; void aes_hw_cpu_encrypt (const byte *ks, byte *data);
+; 32/64-bit build: encrypts one 16-byte block at data in place, reading 15 round keys from ks in ascending order
+	aes_function_entry aes_hw_cpu_encrypt
+		aes_hw_cpu enc, 1
+	aes_function_exit
+
+
+; void aes_hw_cpu_encrypt_32_blocks (const byte *ks, byte *data);
+; 32/64-bit build: encrypts 32 consecutive 16-byte blocks at data in place with the same 15 round keys
+	aes_function_entry aes_hw_cpu_encrypt_32_blocks
+		aes_hw_cpu_32_blocks enc
+	aes_function_exit
+
+
+%endif	; __BITS__ != 16
diff --git a/src/Crypto/Aes_hw_cpu.h b/src/Crypto/Aes_hw_cpu.h
index 2342b4c5..e2fed1a1 100644
--- a/src/Crypto/Aes_hw_cpu.h
+++ b/src/Crypto/Aes_hw_cpu.h
@@ -8,27 +8,27 @@
  and are governed by the Apache License 2.0 the full text of which is
  contained in the file License.txt included in VeraCrypt binary and source
  code distribution packages.
-*/
-
-#ifndef TC_HEADER_Crypto_Aes_Hw_Cpu
-#define TC_HEADER_Crypto_Aes_Hw_Cpu
-
-#include "Common/Tcdefs.h"
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-byte is_aes_hw_cpu_supported ();
-void aes_hw_cpu_enable_sse ();
-void aes_hw_cpu_decrypt (const byte *ks, byte *data);
-void aes_hw_cpu_decrypt_32_blocks (const byte *ks, byte *data);
-void aes_hw_cpu_encrypt (const byte *ks, byte *data);
-void aes_hw_cpu_encrypt_32_blocks (const byte *ks, byte *data);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif // TC_HEADER_Crypto_Aes_Hw_Cpu
+*/
+
+#ifndef TC_HEADER_Crypto_Aes_Hw_Cpu
+#define TC_HEADER_Crypto_Aes_Hw_Cpu
+
+#include "Common/Tcdefs.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+/* Implemented in Aes_hw_cpu.asm. ks points at the round keys; data is processed in place. */
+byte is_aes_hw_cpu_supported (void);	/* 1 if CPUID leaf 1 reports ECX bit 25, otherwise 0 */
+void aes_hw_cpu_enable_sse (void);	/* sets bit 9 of CR4 */
+void aes_hw_cpu_decrypt (const byte *ks, byte *data);	/* one 16-byte block */
+void aes_hw_cpu_decrypt_32_blocks (const byte *ks, byte *data);	/* 32 consecutive 16-byte blocks */
+void aes_hw_cpu_encrypt (const byte *ks, byte *data);	/* one 16-byte block */
+void aes_hw_cpu_encrypt_32_blocks (const byte *ks, byte *data);	/* 32 consecutive 16-byte blocks */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // TC_HEADER_Crypto_Aes_Hw_Cpu
diff --git a/src/Crypto/Aes_x64.asm b/src/Crypto/Aes_x64.asm
index b29fdcac..06d57ac2 100644
--- a/src/Crypto/Aes_x64.asm
+++ b/src/Crypto/Aes_x64.asm
@@ -1,907 +1,907 @@
-
-; ---------------------------------------------------------------------------
-; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-; 
-; LICENSE TERMS
-; 
-; The free distribution and use of this software is allowed (with or without
-; changes) provided that:
-; 
-;  1. source code distributions include the above copyright notice, this
-;     list of conditions and the following disclaimer;
-; 
-;  2. binary distributions include the above copyright notice, this list
-;     of conditions and the following disclaimer in their documentation;
-; 
-;  3. the name of the copyright holder is not used to endorse products
-;     built using this software without specific written permission.
-; 
-; DISCLAIMER
-; 
-; This software is provided 'as is' with no explicit or implied warranties
-; in respect of its properties, including, but not limited to, correctness
-; and/or fitness for purpose.
-; ---------------------------------------------------------------------------
-; Issue 20/12/2007
-;
-; I am grateful to Dag Arne Osvik for many discussions of the techniques that
-; can be used to optimise AES assembler code on AMD64/EM64T architectures.
-; Some of the techniques used in this implementation are the result of
-; suggestions made by him for which I am most grateful.
-
-;
-; Adapted for TrueCrypt:
-; - Compatibility with NASM
-;
-
-; An AES implementation for AMD64 processors using the YASM assembler.  This
-; implemetation provides only encryption, decryption and hence requires key
-; scheduling support in C. It uses 8k bytes of tables but its encryption and
-; decryption performance is very close to that obtained using large tables.
-; It can use either Windows or Gnu/Linux calling conventions, which are as
-; follows:
-;               windows  gnu/linux
-;
-;   in_blk          rcx     rdi
-;   out_blk         rdx     rsi
-;   context (cx)     r8     rdx
-;
-;   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
-;   registers       rdi      -      on both
-;
-;   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
-;   registers        -      rdi     on both
-;
-; The default convention is that for windows, the gnu/linux convention being
-; used if __GNUC__ is defined.
-;
-; Define _SEH_ to include support for Win64 structured exception handling
-; (this requires YASM version 0.6 or later).
-;
-; This code provides the standard AES block size (128 bits, 16 bytes) and the
-; three standard AES key sizes (128, 192 and 256 bits). It has the same call
-; interface as my C implementation.  It uses the Microsoft C AMD64 calling
-; conventions in which the three parameters are placed in  rcx, rdx and r8
-; respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
-;
-;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
-;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
-;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
-;                                            const aes_encrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
-;                                            const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_encrypt_key(const unsigned char key[],
-;                           unsigned int len, const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt_key(const unsigned char key[],
-;                           unsigned int len, const aes_decrypt_ctx cx[1]);
-;
-; where <NNN> is 128, 102 or 256.  In the last two calls the length can be in
-; either bits or bytes.
-;
-; Comment in/out the following lines to obtain the desired subroutines. These
-; selections MUST match those in the C header file aes.h
-
-; %define AES_128                 ; define if AES with 128 bit keys is needed
-; %define AES_192                 ; define if AES with 192 bit keys is needed
-%define AES_256                 ; define if AES with 256 bit keys is needed
-; %define AES_VAR                 ; define if a variable key size is needed
-%define ENCRYPTION              ; define if encryption is needed
-%define DECRYPTION              ; define if decryption is needed
-%define AES_REV_DKS             ; define if key decryption schedule is reversed
-%define LAST_ROUND_TABLES       ; define for the faster version using extra tables
-
-; The encryption key schedule has the following in memory layout where N is the
-; number of rounds (10, 12 or 14):
-;
-; lo: | input key (round 0)  |  ; each round is four 32-bit words
-;     | encryption round 1   |
-;     | encryption round 2   |
-;     ....
-;     | encryption round N-1 |
-; hi: | encryption round N   |
-;
-; The decryption key schedule is normally set up so that it has the same
-; layout as above by actually reversing the order of the encryption key
-; schedule in memory (this happens when AES_REV_DKS is set):
-;
-; lo: | decryption round 0   | =              | encryption round N   |
-;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
-;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
-;     ....                       ....
-;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
-; hi: | decryption round N   | =              | input key (round 0)  |
-;
-; with rounds except the first and last modified using inv_mix_column()
-; But if AES_REV_DKS is NOT set the order of keys is left as it is for
-; encryption so that it has to be accessed in reverse when used for
-; decryption (although the inverse mix column modifications are done)
-;
-; lo: | decryption round 0   | =              | input key (round 0)  |
-;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
-;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
-;     ....                       ....
-;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
-; hi: | decryption round N   | =              | encryption round N   |
-;
-; This layout is faster when the assembler key scheduling provided here
-; is used.
-;
-; The DLL interface must use the _stdcall convention in which the number
-; of bytes of parameter space is added after an @ to the sutine's name.
-; We must also remove our parameters from the stack before return (see
-; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
-
-;%define DLL_EXPORT
-
-; End of user defines
-
-%ifdef AES_VAR
-%ifndef AES_128
-%define AES_128
-%endif
-%ifndef AES_192
-%define AES_192
-%endif
-%ifndef AES_256
-%define AES_256
-%endif
-%endif
-
-%ifdef AES_VAR
-%define KS_LENGTH       60
-%elifdef AES_256
-%define KS_LENGTH       60
-%elifdef AES_192
-%define KS_LENGTH       52
-%else
-%define KS_LENGTH       44
-%endif
-
-%define     r0  rax
-%define     r1  rdx
-%define     r2  rcx
-%define     r3  rbx
-%define     r4  rsi
-%define     r5  rdi
-%define     r6  rbp
-%define     r7  rsp
-
-%define     raxd    eax
-%define     rdxd    edx
-%define     rcxd    ecx
-%define     rbxd    ebx
-%define     rsid    esi
-%define     rdid    edi
-%define     rbpd    ebp
-%define     rspd    esp
-
-%define     raxb    al
-%define     rdxb    dl
-%define     rcxb    cl
-%define     rbxb    bl
-%define     rsib    sil
-%define     rdib    dil
-%define     rbpb    bpl
-%define     rspb    spl
-
-%define     r0h ah
-%define     r1h dh
-%define     r2h ch
-%define     r3h bh
-
-%define     r0d eax
-%define     r1d edx
-%define     r2d ecx
-%define     r3d ebx
-
-; finite field multiplies by {02}, {04} and {08}
-
-%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
-%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
-%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
-
-; finite field multiplies required in table generation
-
-%define f3(x)   (f2(x) ^ x)
-%define f9(x)   (f8(x) ^ x)
-%define fb(x)   (f8(x) ^ f2(x) ^ x)
-%define fd(x)   (f8(x) ^ f4(x) ^ x)
-%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
-
-; macro for expanding S-box data
-
-%macro enc_vals 1
-    db  %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)
-    db  %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)
-    db  %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)
-    db  %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)
-    db  %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)
-    db  %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)
-    db  %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)
-    db  %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)
-    db  %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)
-    db  %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)
-    db  %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)
-    db  %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)
-    db  %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)
-    db  %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)
-    db  %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)
-    db  %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)
-    db  %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)
-    db  %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)
-    db  %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)
-    db  %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)
-    db  %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)
-    db  %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)
-    db  %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)
-    db  %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)
-    db  %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)
-    db  %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)
-    db  %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)
-    db  %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)
-    db  %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)
-    db  %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)
-    db  %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)
-    db  %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)
-%endmacro
-
-%macro dec_vals 1
-    db  %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)
-    db  %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)
-    db  %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)
-    db  %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)
-    db  %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)
-    db  %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)
-    db  %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)
-    db  %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)
-    db  %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)
-    db  %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)
-    db  %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)
-    db  %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)
-    db  %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)
-    db  %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)
-    db  %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)
-    db  %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)
-    db  %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)
-    db  %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)
-    db  %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)
-    db  %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)
-    db  %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)
-    db  %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)
-    db  %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)
-    db  %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)
-    db  %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)
-    db  %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)
-    db  %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)
-    db  %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)
-    db  %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)
-    db  %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)
-    db  %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)
-    db  %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)
-%endmacro
-
-%define u8(x)   f2(x), x, x, f3(x), f2(x), x, x, f3(x)
-%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
-%define w8(x)   x, 0, 0, 0, x, 0, 0, 0
-
-%define tptr    rbp     ; table pointer
-%define kptr    r8      ; key schedule pointer
-%define fofs    128     ; adjust offset in key schedule to keep |disp| < 128
-%define fk_ref(x,y) [kptr-16*x+fofs+4*y]
-%ifdef  AES_REV_DKS
-%define rofs    128
-%define ik_ref(x,y) [kptr-16*x+rofs+4*y]
-%else
-%define rofs    -128
-%define ik_ref(x,y) [kptr+16*x+rofs+4*y]
-%endif
-
-%define tab_0(x)   [tptr+8*x]
-%define tab_1(x)   [tptr+8*x+3]
-%define tab_2(x)   [tptr+8*x+2]
-%define tab_3(x)   [tptr+8*x+1]
-%define tab_f(x)   byte [tptr+8*x+1]
-%define tab_i(x)   byte [tptr+8*x+7]
-%define t_ref(x,r) tab_ %+ x(r)
-
-%macro ff_rnd 5                 ; normal forward round
-    mov     %1d, fk_ref(%5,0)
-    mov     %2d, fk_ref(%5,1)
-    mov     %3d, fk_ref(%5,2)
-    mov     %4d, fk_ref(%5,3)
-
-    movzx   esi, al
-    movzx   edi, ah
-    shr     eax, 16
-    xor     %1d, t_ref(0,rsi)
-    xor     %4d, t_ref(1,rdi)
-    movzx   esi, al
-    movzx   edi, ah
-    xor     %3d, t_ref(2,rsi)
-    xor     %2d, t_ref(3,rdi)
-
-    movzx   esi, bl
-    movzx   edi, bh
-    shr     ebx, 16
-    xor     %2d, t_ref(0,rsi)
-    xor     %1d, t_ref(1,rdi)
-    movzx   esi, bl
-    movzx   edi, bh
-    xor     %4d, t_ref(2,rsi)
-    xor     %3d, t_ref(3,rdi)
-
-    movzx   esi, cl
-    movzx   edi, ch
-    shr     ecx, 16
-    xor     %3d, t_ref(0,rsi)
-    xor     %2d, t_ref(1,rdi)
-    movzx   esi, cl
-    movzx   edi, ch
-    xor     %1d, t_ref(2,rsi)
-    xor     %4d, t_ref(3,rdi)
-
-    movzx   esi, dl
-    movzx   edi, dh
-    shr     edx, 16
-    xor     %4d, t_ref(0,rsi)
-    xor     %3d, t_ref(1,rdi)
-    movzx   esi, dl
-    movzx   edi, dh
-    xor     %2d, t_ref(2,rsi)
-    xor     %1d, t_ref(3,rdi)
-
-    mov     eax,%1d
-    mov     ebx,%2d
-    mov     ecx,%3d
-    mov     edx,%4d
-%endmacro
-
-%ifdef LAST_ROUND_TABLES
-
-%macro fl_rnd 5                 ; last forward round
-    add     tptr, 2048
-    mov     %1d, fk_ref(%5,0)
-    mov     %2d, fk_ref(%5,1)
-    mov     %3d, fk_ref(%5,2)
-    mov     %4d, fk_ref(%5,3)
-
-    movzx   esi, al
-    movzx   edi, ah
-    shr     eax, 16
-    xor     %1d, t_ref(0,rsi)
-    xor     %4d, t_ref(1,rdi)
-    movzx   esi, al
-    movzx   edi, ah
-    xor     %3d, t_ref(2,rsi)
-    xor     %2d, t_ref(3,rdi)
-
-    movzx   esi, bl
-    movzx   edi, bh
-    shr     ebx, 16
-    xor     %2d, t_ref(0,rsi)
-    xor     %1d, t_ref(1,rdi)
-    movzx   esi, bl
-    movzx   edi, bh
-    xor     %4d, t_ref(2,rsi)
-    xor     %3d, t_ref(3,rdi)
-
-    movzx   esi, cl
-    movzx   edi, ch
-    shr     ecx, 16
-    xor     %3d, t_ref(0,rsi)
-    xor     %2d, t_ref(1,rdi)
-    movzx   esi, cl
-    movzx   edi, ch
-    xor     %1d, t_ref(2,rsi)
-    xor     %4d, t_ref(3,rdi)
-
-    movzx   esi, dl
-    movzx   edi, dh
-    shr     edx, 16
-    xor     %4d, t_ref(0,rsi)
-    xor     %3d, t_ref(1,rdi)
-    movzx   esi, dl
-    movzx   edi, dh
-    xor     %2d, t_ref(2,rsi)
-    xor     %1d, t_ref(3,rdi)
-%endmacro
-
-%else
-
-%macro fl_rnd 5                 ; last forward round
-    mov     %1d, fk_ref(%5,0)
-    mov     %2d, fk_ref(%5,1)
-    mov     %3d, fk_ref(%5,2)
-    mov     %4d, fk_ref(%5,3)
-
-    movzx   esi, al
-    movzx   edi, ah
-    shr     eax, 16
-    movzx   esi, t_ref(f,rsi)
-    movzx   edi, t_ref(f,rdi)
-    xor     %1d, esi
-    rol     edi, 8
-    xor     %4d, edi
-    movzx   esi, al
-    movzx   edi, ah
-    movzx   esi, t_ref(f,rsi)
-    movzx   edi, t_ref(f,rdi)
-    rol     esi, 16
-    rol     edi, 24
-    xor     %3d, esi
-    xor     %2d, edi
-
-    movzx   esi, bl
-    movzx   edi, bh
-    shr     ebx, 16
-    movzx   esi, t_ref(f,rsi)
-    movzx   edi, t_ref(f,rdi)
-    xor     %2d, esi
-    rol     edi, 8
-    xor     %1d, edi
-    movzx   esi, bl
-    movzx   edi, bh
-    movzx   esi, t_ref(f,rsi)
-    movzx   edi, t_ref(f,rdi)
-    rol     esi, 16
-    rol     edi, 24
-    xor     %4d, esi
-    xor     %3d, edi
-
-    movzx   esi, cl
-    movzx   edi, ch
-    movzx   esi, t_ref(f,rsi)
-    movzx   edi, t_ref(f,rdi)
-    shr     ecx, 16
-    xor     %3d, esi
-    rol     edi, 8
-    xor     %2d, edi
-    movzx   esi, cl
-    movzx   edi, ch
-    movzx   esi, t_ref(f,rsi)
-    movzx   edi, t_ref(f,rdi)
-    rol     esi, 16
-    rol     edi, 24
-    xor     %1d, esi
-    xor     %4d, edi
-
-    movzx   esi, dl
-    movzx   edi, dh
-    movzx   esi, t_ref(f,rsi)
-    movzx   edi, t_ref(f,rdi)
-    shr     edx, 16
-    xor     %4d, esi
-    rol     edi, 8
-    xor     %3d, edi
-    movzx   esi, dl
-    movzx   edi, dh
-    movzx   esi, t_ref(f,rsi)
-    movzx   edi, t_ref(f,rdi)
-    rol     esi, 16
-    rol     edi, 24
-    xor     %2d, esi
-    xor     %1d, edi
-%endmacro
-
-%endif
-
-%macro ii_rnd 5                 ; normal inverse round
-    mov     %1d, ik_ref(%5,0)
-    mov     %2d, ik_ref(%5,1)
-    mov     %3d, ik_ref(%5,2)
-    mov     %4d, ik_ref(%5,3)
-
-    movzx   esi, al
-    movzx   edi, ah
-    shr     eax, 16
-    xor     %1d, t_ref(0,rsi)
-    xor     %2d, t_ref(1,rdi)
-    movzx   esi, al
-    movzx   edi, ah
-    xor     %3d, t_ref(2,rsi)
-    xor     %4d, t_ref(3,rdi)
-
-    movzx   esi, bl
-    movzx   edi, bh
-    shr     ebx, 16
-    xor     %2d, t_ref(0,rsi)
-    xor     %3d, t_ref(1,rdi)
-    movzx   esi, bl
-    movzx   edi, bh
-    xor     %4d, t_ref(2,rsi)
-    xor     %1d, t_ref(3,rdi)
-
-    movzx   esi, cl
-    movzx   edi, ch
-    shr     ecx, 16
-    xor     %3d, t_ref(0,rsi)
-    xor     %4d, t_ref(1,rdi)
-    movzx   esi, cl
-    movzx   edi, ch
-    xor     %1d, t_ref(2,rsi)
-    xor     %2d, t_ref(3,rdi)
-
-    movzx   esi, dl
-    movzx   edi, dh
-    shr     edx, 16
-    xor     %4d, t_ref(0,rsi)
-    xor     %1d, t_ref(1,rdi)
-    movzx   esi, dl
-    movzx   edi, dh
-    xor     %2d, t_ref(2,rsi)
-    xor     %3d, t_ref(3,rdi)
-
-    mov     eax,%1d
-    mov     ebx,%2d
-    mov     ecx,%3d
-    mov     edx,%4d
-%endmacro
-
-%ifdef LAST_ROUND_TABLES
-
-%macro il_rnd 5                 ; last inverse round
-    add     tptr, 2048
-    mov     %1d, ik_ref(%5,0)
-    mov     %2d, ik_ref(%5,1)
-    mov     %3d, ik_ref(%5,2)
-    mov     %4d, ik_ref(%5,3)
-
-    movzx   esi, al
-    movzx   edi, ah
-    shr     eax, 16
-    xor     %1d, t_ref(0,rsi)
-    xor     %2d, t_ref(1,rdi)
-    movzx   esi, al
-    movzx   edi, ah
-    xor     %3d, t_ref(2,rsi)
-    xor     %4d, t_ref(3,rdi)
-
-    movzx   esi, bl
-    movzx   edi, bh
-    shr     ebx, 16
-    xor     %2d, t_ref(0,rsi)
-    xor     %3d, t_ref(1,rdi)
-    movzx   esi, bl
-    movzx   edi, bh
-    xor     %4d, t_ref(2,rsi)
-    xor     %1d, t_ref(3,rdi)
-
-    movzx   esi, cl
-    movzx   edi, ch
-    shr     ecx, 16
-    xor     %3d, t_ref(0,rsi)
-    xor     %4d, t_ref(1,rdi)
-    movzx   esi, cl
-    movzx   edi, ch
-    xor     %1d, t_ref(2,rsi)
-    xor     %2d, t_ref(3,rdi)
-
-    movzx   esi, dl
-    movzx   edi, dh
-    shr     edx, 16
-    xor     %4d, t_ref(0,rsi)
-    xor     %1d, t_ref(1,rdi)
-    movzx   esi, dl
-    movzx   edi, dh
-    xor     %2d, t_ref(2,rsi)
-    xor     %3d, t_ref(3,rdi)
-%endmacro
-
-%else
-
-%macro il_rnd 5                 ; last inverse round
-    mov     %1d, ik_ref(%5,0)
-    mov     %2d, ik_ref(%5,1)
-    mov     %3d, ik_ref(%5,2)
-    mov     %4d, ik_ref(%5,3)
-
-    movzx   esi, al
-    movzx   edi, ah
-    movzx   esi, t_ref(i,rsi)
-    movzx   edi, t_ref(i,rdi)
-    shr     eax, 16
-    xor     %1d, esi
-    rol     edi, 8
-    xor     %2d, edi
-    movzx   esi, al
-    movzx   edi, ah
-    movzx   esi, t_ref(i,rsi)
-    movzx   edi, t_ref(i,rdi)
-    rol     esi, 16
-    rol     edi, 24
-    xor     %3d, esi
-    xor     %4d, edi
-
-    movzx   esi, bl
-    movzx   edi, bh
-    movzx   esi, t_ref(i,rsi)
-    movzx   edi, t_ref(i,rdi)
-    shr     ebx, 16
-    xor     %2d, esi
-    rol     edi, 8
-    xor     %3d, edi
-    movzx   esi, bl
-    movzx   edi, bh
-    movzx   esi, t_ref(i,rsi)
-    movzx   edi, t_ref(i,rdi)
-    rol     esi, 16
-    rol     edi, 24
-    xor     %4d, esi
-    xor     %1d, edi
-
-    movzx   esi, cl
-    movzx   edi, ch
-    movzx   esi, t_ref(i,rsi)
-    movzx   edi, t_ref(i,rdi)
-    shr     ecx, 16
-    xor     %3d, esi
-    rol     edi, 8
-    xor     %4d, edi
-    movzx   esi, cl
-    movzx   edi, ch
-    movzx   esi, t_ref(i,rsi)
-    movzx   edi, t_ref(i,rdi)
-    rol     esi, 16
-    rol     edi, 24
-    xor     %1d, esi
-    xor     %2d, edi
-
-    movzx   esi, dl
-    movzx   edi, dh
-    movzx   esi, t_ref(i,rsi)
-    movzx   edi, t_ref(i,rdi)
-    shr     edx, 16
-    xor     %4d, esi
-    rol     edi, 8
-    xor     %1d, edi
-    movzx   esi, dl
-    movzx   edi, dh
-    movzx   esi, t_ref(i,rsi)
-    movzx   edi, t_ref(i,rdi)
-    rol     esi, 16
-    rol     edi, 24
-    xor     %2d, esi
-    xor     %3d, edi
-%endmacro
-
-%endif
-
-%ifdef ENCRYPTION
-
-    global  aes_encrypt
-%ifdef DLL_EXPORT
-    export  aes_encrypt
-%endif
-
-    section .data align=64
-    align   64
-enc_tab:
-    enc_vals u8
-%ifdef LAST_ROUND_TABLES
-    enc_vals w8
-%endif
-
-    section .text align=16
-    align   16
-
-%ifdef _SEH_
-proc_frame aes_encrypt
-	alloc_stack	7*8			; 7 to align stack to 16 bytes
-	save_reg	rsi,4*8
-	save_reg	rdi,5*8
-	save_reg	rbx,1*8
-	save_reg	rbp,2*8
-	save_reg	r12,3*8
-end_prologue
-    mov     rdi, rcx        ; input pointer
-    mov     [rsp+0*8], rdx  ; output pointer
-%else
-	aes_encrypt:
-	%ifdef __GNUC__
-		sub     rsp, 4*8        ; gnu/linux binary interface
-		mov     [rsp+0*8], rsi  ; output pointer
-		mov     r8, rdx         ; context
-	%else
-		sub     rsp, 6*8        ; windows binary interface
-		mov     [rsp+4*8], rsi
-		mov     [rsp+5*8], rdi
-		mov     rdi, rcx        ; input pointer
-		mov     [rsp+0*8], rdx  ; output pointer
-	%endif
-		mov     [rsp+1*8], rbx  ; input pointer in rdi
-		mov     [rsp+2*8], rbp  ; output pointer in [rsp]
-		mov     [rsp+3*8], r12  ; context in r8
-%endif
-
-    movzx   esi, byte [kptr+4*KS_LENGTH]
-    lea     tptr, [rel enc_tab]
-    sub     kptr, fofs
-
-    mov     eax, [rdi+0*4]
-    mov     ebx, [rdi+1*4]
-    mov     ecx, [rdi+2*4]
-    mov     edx, [rdi+3*4]
-
-    xor     eax, [kptr+fofs]
-    xor     ebx, [kptr+fofs+4]
-    xor     ecx, [kptr+fofs+8]
-    xor     edx, [kptr+fofs+12]
-
-    lea     kptr,[kptr+rsi]
-    cmp     esi, 10*16
-    je      .3
-    cmp     esi, 12*16
-    je      .2
-    cmp     esi, 14*16
-    je      .1
-    mov     rax, -1
-    jmp     .4
-
-.1: ff_rnd  r9, r10, r11, r12, 13
-    ff_rnd  r9, r10, r11, r12, 12
-.2: ff_rnd  r9, r10, r11, r12, 11
-    ff_rnd  r9, r10, r11, r12, 10
-.3: ff_rnd  r9, r10, r11, r12, 9
-    ff_rnd  r9, r10, r11, r12, 8
-    ff_rnd  r9, r10, r11, r12, 7
-    ff_rnd  r9, r10, r11, r12, 6
-    ff_rnd  r9, r10, r11, r12, 5
-    ff_rnd  r9, r10, r11, r12, 4
-    ff_rnd  r9, r10, r11, r12, 3
-    ff_rnd  r9, r10, r11, r12, 2
-    ff_rnd  r9, r10, r11, r12, 1
-    fl_rnd  r9, r10, r11, r12, 0
-
-    mov     rbx, [rsp]
-    mov     [rbx], r9d
-    mov     [rbx+4], r10d
-    mov     [rbx+8], r11d
-    mov     [rbx+12], r12d
-    xor     rax, rax
-.4:
-    mov     rbx, [rsp+1*8]
-    mov     rbp, [rsp+2*8]
-    mov     r12, [rsp+3*8]
-%ifdef __GNUC__
-    add     rsp, 4*8
-    ret
-%else
-		mov     rsi, [rsp+4*8]
-		mov     rdi, [rsp+5*8]
-	%ifdef _SEH_
-		add     rsp, 7*8
-		ret
-	endproc_frame
-	%else
-		add     rsp, 6*8
-		ret
-	%endif
-%endif
-
-%endif
-
-%ifdef DECRYPTION
-
-    global  aes_decrypt
-%ifdef DLL_EXPORT
-    export  aes_decrypt
-%endif
-
-    section .data
-    align   64
-dec_tab:
-    dec_vals v8
-%ifdef LAST_ROUND_TABLES
-    dec_vals w8
-%endif
-
-    section .text
-    align   16
-
-%ifdef _SEH_
-proc_frame aes_decrypt
-	alloc_stack	7*8			; 7 to align stack to 16 bytes
-	save_reg	rsi,4*8
-	save_reg	rdi,5*8
-	save_reg	rbx,1*8
-	save_reg	rbp,2*8
-	save_reg	r12,3*8
-end_prologue
-    mov     rdi, rcx        ; input pointer
-    mov     [rsp+0*8], rdx  ; output pointer
-%else
-	aes_decrypt:
-	%ifdef __GNUC__
-		sub     rsp, 4*8        ; gnu/linux binary interface
-		mov     [rsp+0*8], rsi  ; output pointer
-		mov     r8, rdx         ; context
-	%else
-		sub     rsp, 6*8        ; windows binary interface
-		mov     [rsp+4*8], rsi
-		mov     [rsp+5*8], rdi
-		mov     rdi, rcx        ; input pointer
-		mov     [rsp+0*8], rdx  ; output pointer
-	%endif
-		mov     [rsp+1*8], rbx  ; input pointer in rdi
-		mov     [rsp+2*8], rbp  ; output pointer in [rsp]
-		mov     [rsp+3*8], r12  ; context in r8
-%endif
-
-    movzx   esi,byte[kptr+4*KS_LENGTH]
-    lea     tptr, [rel dec_tab]
-    sub     kptr, rofs
-
-    mov     eax, [rdi+0*4]
-    mov     ebx, [rdi+1*4]
-    mov     ecx, [rdi+2*4]
-    mov     edx, [rdi+3*4]
-
-%ifdef      AES_REV_DKS
-    mov     rdi, kptr
-    lea     kptr,[kptr+rsi]
-%else
-    lea     rdi,[kptr+rsi]
-%endif
-
-    xor     eax, [rdi+rofs]
-    xor     ebx, [rdi+rofs+4]
-    xor     ecx, [rdi+rofs+8]
-    xor     edx, [rdi+rofs+12]
-
-    cmp     esi, 10*16
-    je      .3
-    cmp     esi, 12*16
-    je      .2
-    cmp     esi, 14*16
-    je      .1
-    mov     rax, -1
-    jmp     .4
-
-.1: ii_rnd  r9, r10, r11, r12, 13
-    ii_rnd  r9, r10, r11, r12, 12
-.2: ii_rnd  r9, r10, r11, r12, 11
-    ii_rnd  r9, r10, r11, r12, 10
-.3: ii_rnd  r9, r10, r11, r12, 9
-    ii_rnd  r9, r10, r11, r12, 8
-    ii_rnd  r9, r10, r11, r12, 7
-    ii_rnd  r9, r10, r11, r12, 6
-    ii_rnd  r9, r10, r11, r12, 5
-    ii_rnd  r9, r10, r11, r12, 4
-    ii_rnd  r9, r10, r11, r12, 3
-    ii_rnd  r9, r10, r11, r12, 2
-    ii_rnd  r9, r10, r11, r12, 1
-    il_rnd  r9, r10, r11, r12, 0
-
-    mov     rbx, [rsp]
-    mov     [rbx], r9d
-    mov     [rbx+4], r10d
-    mov     [rbx+8], r11d
-    mov     [rbx+12], r12d
-    xor     rax, rax
-.4: mov     rbx, [rsp+1*8]
-    mov     rbp, [rsp+2*8]
-    mov     r12, [rsp+3*8]
-%ifdef __GNUC__
-    add     rsp, 4*8
-    ret
-%else
-		mov     rsi, [rsp+4*8]
-		mov     rdi, [rsp+5*8]
-	%ifdef _SEH_
-		add     rsp, 7*8
-		ret
-	endproc_frame
-	%else
-		add     rsp, 6*8
-		ret
-	%endif
-%endif
-
-%endif
+
+; ---------------------------------------------------------------------------
+; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+; 
+; LICENSE TERMS
+; 
+; The free distribution and use of this software is allowed (with or without
+; changes) provided that:
+; 
+;  1. source code distributions include the above copyright notice, this
+;     list of conditions and the following disclaimer;
+; 
+;  2. binary distributions include the above copyright notice, this list
+;     of conditions and the following disclaimer in their documentation;
+; 
+;  3. the name of the copyright holder is not used to endorse products
+;     built using this software without specific written permission.
+; 
+; DISCLAIMER
+; 
+; This software is provided 'as is' with no explicit or implied warranties
+; in respect of its properties, including, but not limited to, correctness
+; and/or fitness for purpose.
+; ---------------------------------------------------------------------------
+; Issue 20/12/2007
+;
+; I am grateful to Dag Arne Osvik for many discussions of the techniques that
+; can be used to optimise AES assembler code on AMD64/EM64T architectures.
+; Some of the techniques used in this implementation are the result of
+; suggestions made by him for which I am most grateful.
+
+;
+; Adapted for TrueCrypt:
+; - Compatibility with NASM
+;
+
+; An AES implementation for AMD64 processors using the YASM assembler.  This
+; implementation provides only encryption and decryption and hence requires key
+; scheduling support in C. It uses 8k bytes of tables but its encryption and
+; decryption performance is very close to that obtained using large tables.
+; It can use either Windows or Gnu/Linux calling conventions, which are as
+; follows:
+;               windows  gnu/linux
+;
+;   in_blk          rcx     rdi
+;   out_blk         rdx     rsi
+;   context (cx)     r8     rdx
+;
+;   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
+;   registers       rdi      -      on both
+;
+;   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
+;   registers        -      rdi     on both
+;
+; The default convention is that for windows, the gnu/linux convention being
+; used if __GNUC__ is defined.
+;
+; Define _SEH_ to include support for Win64 structured exception handling
+; (this requires YASM version 0.6 or later).
+;
+; This code provides the standard AES block size (128 bits, 16 bytes) and the
+; three standard AES key sizes (128, 192 and 256 bits). It has the same call
+; interface as my C implementation.  It uses the Microsoft C AMD64 calling
+; conventions in which the three parameters are placed in  rcx, rdx and r8
+; respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
+;
+;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
+;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
+;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
+;                                            const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
+;                                            const aes_decrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_encrypt_key(const unsigned char key[],
+;                           unsigned int len, const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt_key(const unsigned char key[],
+;                           unsigned int len, const aes_decrypt_ctx cx[1]);
+;
+; where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
+; either bits or bytes.
+;
+; Comment in/out the following lines to obtain the desired subroutines. These
+; selections MUST match those in the C header file aes.h
+
+; %define AES_128                 ; define if AES with 128 bit keys is needed
+; %define AES_192                 ; define if AES with 192 bit keys is needed
+%define AES_256                 ; define if AES with 256 bit keys is needed
+; %define AES_VAR                 ; define if a variable key size is needed
+%define ENCRYPTION              ; define if encryption is needed
+%define DECRYPTION              ; define if decryption is needed
+%define AES_REV_DKS             ; define if key decryption schedule is reversed
+%define LAST_ROUND_TABLES       ; define for the faster version using extra tables
+
+; The encryption key schedule has the following in memory layout where N is the
+; number of rounds (10, 12 or 14):
+;
+; lo: | input key (round 0)  |  ; each round is four 32-bit words
+;     | encryption round 1   |
+;     | encryption round 2   |
+;     ....
+;     | encryption round N-1 |
+; hi: | encryption round N   |
+;
+; The decryption key schedule is normally set up so that it has the same
+; layout as above by actually reversing the order of the encryption key
+; schedule in memory (this happens when AES_REV_DKS is set):
+;
+; lo: | decryption round 0   | =              | encryption round N   |
+;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
+;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
+;     ....                       ....
+;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
+; hi: | decryption round N   | =              | input key (round 0)  |
+;
+; with rounds except the first and last modified using inv_mix_column()
+; But if AES_REV_DKS is NOT set the order of keys is left as it is for
+; encryption so that it has to be accessed in reverse when used for
+; decryption (although the inverse mix column modifications are done)
+;
+; lo: | decryption round 0   | =              | input key (round 0)  |
+;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
+;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
+;     ....                       ....
+;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
+; hi: | decryption round N   | =              | encryption round N   |
+;
+; This layout is faster when the assembler key scheduling provided here
+; is used.
+;
+; The DLL interface must use the _stdcall convention in which the number
+; of bytes of parameter space is added after an @ to the subroutine's name.
+; We must also remove our parameters from the stack before return (see
+; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
+
+;%define DLL_EXPORT
+
+; End of user defines. Derived settings follow: AES_VAR (variable key size)
+; switches on every fixed key size that is not already defined.
+%ifdef AES_VAR
+%ifndef AES_128
+%define AES_128
+%endif
+%ifndef AES_192
+%define AES_192
+%endif
+%ifndef AES_256
+%define AES_256
+%endif
+%endif
+; KS_LENGTH: key schedule length in 32-bit words for the largest enabled key size
+%ifdef AES_VAR
+%define KS_LENGTH       60      ; 15 round keys x 4 words
+%elifdef AES_256
+%define KS_LENGTH       60      ; 15 round keys x 4 words (14 rounds)
+%elifdef AES_192
+%define KS_LENGTH       52      ; 13 round keys x 4 words (12 rounds)
+%else
+%define KS_LENGTH       44      ; 11 round keys x 4 words (10 rounds)
+%endif
+
+%define     r0  rax             ; r0..r7: numbered aliases for the eight legacy 64-bit registers
+%define     r1  rdx
+%define     r2  rcx
+%define     r3  rbx
+%define     r4  rsi
+%define     r5  rdi
+%define     r6  rbp
+%define     r7  rsp
+; <name>d: 32-bit register reached by appending 'd' to a legacy 64-bit register name
+%define     raxd    eax
+%define     rdxd    edx
+%define     rcxd    ecx
+%define     rbxd    ebx
+%define     rsid    esi
+%define     rdid    edi
+%define     rbpd    ebp
+%define     rspd    esp
+; <name>b: low-byte register reached by appending 'b' to a legacy 64-bit register name
+%define     raxb    al
+%define     rdxb    dl
+%define     rcxb    cl
+%define     rbxb    bl
+%define     rsib    sil
+%define     rdib    dil
+%define     rbpb    bpl
+%define     rspb    spl
+; r0h..r3h: bits 8..15 of r0..r3 (only these four registers have such a form)
+%define     r0h ah
+%define     r1h dh
+%define     r2h ch
+%define     r3h bh
+; r0d..r3d: 32-bit forms of the numbered aliases r0..r3
+%define     r0d eax
+%define     r1d edx
+%define     r2d ecx
+%define     r3d ebx
+
+; finite field multiplies by {02}, {04} and {08}: shift x left, then XOR in the
+; matching multiple of 0x11b (x^8+x^4+x^3+x+1) for every bit shifted past bit 7.
+%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
+%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
+%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
+; NOTE: x is not parenthesised in these bodies; pass a plain byte constant (0..255).
+; finite field multiplies required in table generation, built by XOR-ing the
+; products above: f3 = {03}, f9 = {09}, fb = {0b}, fd = {0d}, fe = {0e}
+%define f3(x)   (f2(x) ^ x)
+%define f9(x)   (f8(x) ^ x)
+%define fb(x)   (f8(x) ^ f2(x) ^ x)
+%define fd(x)   (f8(x) ^ f4(x) ^ x)
+%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
+
+; enc_vals: emit the 256 forward S-box values in index order as db data, each
+; value wrapped in the single-line macro named by parameter 1 (u8 or w8 below).
+%macro enc_vals 1
+    db  %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)
+    db  %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)
+    db  %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)
+    db  %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)
+    db  %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)
+    db  %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)
+    db  %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)
+    db  %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)
+    db  %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)
+    db  %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)
+    db  %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)
+    db  %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)
+    db  %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)
+    db  %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)
+    db  %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)
+    db  %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)
+    db  %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)
+    db  %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)
+    db  %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)
+    db  %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)
+    db  %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)
+    db  %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)
+    db  %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)
+    db  %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)
+    db  %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)
+    db  %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)
+    db  %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)
+    db  %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)
+    db  %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)
+    db  %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)
+    db  %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)
+    db  %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)
+%endmacro
+
+%macro dec_vals 1               ; emit the 256 inverse S-box values as db data, each wrapped in the macro named by parameter 1 (v8 or w8)
+    db  %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)
+    db  %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)
+    db  %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)
+    db  %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)
+    db  %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)
+    db  %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)
+    db  %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)
+    db  %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)
+    db  %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)
+    db  %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)
+    db  %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)
+    db  %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)
+    db  %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)
+    db  %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)
+    db  %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)
+    db  %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)
+    db  %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)
+    db  %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)
+    db  %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)
+    db  %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)
+    db  %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)
+    db  %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)
+    db  %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)
+    db  %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)
+    db  %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)
+    db  %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)
+    db  %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)
+    db  %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)
+    db  %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)
+    db  %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)
+    db  %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)
+    db  %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)
+%endmacro
+
+%define u8(x)   f2(x), x, x, f3(x), f2(x), x, x, f3(x)     ; 8-byte forward round entry: the 4-byte word {02,01,01,03}*x stored twice
+%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x     ; 8-byte inverse round entry {0e,09,0d,0b}*x; byte 7 holds plain x (read by tab_i)
+%define w8(x)   x, 0, 0, 0, x, 0, 0, 0     ; 8-byte last-round entry: the bare S-box value stored twice
+; Storing each 4-byte word twice lets one entry serve all four byte rotations (see tab_0..tab_3).
+%define tptr    rbp     ; table pointer
+%define kptr    r8      ; key schedule pointer
+%define fofs    128     ; adjust offset in key schedule to keep |disp| < 128
+%define fk_ref(x,y) [kptr-16*x+fofs+4*y]     ; forward key word y of the round key 16*x bytes below kptr+fofs
+%ifdef  AES_REV_DKS
+%define rofs    128
+%define ik_ref(x,y) [kptr-16*x+rofs+4*y]     ; reversed decryption schedule: indexed downwards like fk_ref
+%else
+%define rofs    -128
+%define ik_ref(x,y) [kptr+16*x+rofs+4*y]     ; schedule left in encryption order: indexed upwards
+%endif
+; dword loads at byte offsets 0, 3, 2, 1 of an entry give its word rotated left by 0, 8, 16, 24 bits
+%define tab_0(x)   [tptr+8*x]
+%define tab_1(x)   [tptr+8*x+3]
+%define tab_2(x)   [tptr+8*x+2]
+%define tab_3(x)   [tptr+8*x+1]
+%define tab_f(x)   byte [tptr+8*x+1]     ; byte 1 of a u8 entry = plain forward S-box value
+%define tab_i(x)   byte [tptr+8*x+7]     ; byte 7 of a v8 entry = plain inverse S-box value
+%define t_ref(x,r) tab_ %+ x(r)     ; pastes the selector x (0-3, f or i) onto tab_ and applies it to index register r
+
+%macro ff_rnd 5                 ; normal forward round: params 1-4 = 64-bit work registers (out0..out3), param 5 = round index for fk_ref
+    mov     %1d, fk_ref(%5,0)
+    mov     %2d, fk_ref(%5,1)
+    mov     %3d, fk_ref(%5,2)
+    mov     %4d, fk_ref(%5,3)
+    ; out0..out3 now hold the round key; state column eax: bytes 0,1,2,3 -> out0,out3,out2,out1
+    movzx   esi, al
+    movzx   edi, ah
+    shr     eax, 16
+    xor     %1d, t_ref(0,rsi)
+    xor     %4d, t_ref(1,rdi)
+    movzx   esi, al
+    movzx   edi, ah
+    xor     %3d, t_ref(2,rsi)
+    xor     %2d, t_ref(3,rdi)
+    ; state column ebx: bytes 0,1,2,3 -> out1,out0,out3,out2
+    movzx   esi, bl
+    movzx   edi, bh
+    shr     ebx, 16
+    xor     %2d, t_ref(0,rsi)
+    xor     %1d, t_ref(1,rdi)
+    movzx   esi, bl
+    movzx   edi, bh
+    xor     %4d, t_ref(2,rsi)
+    xor     %3d, t_ref(3,rdi)
+    ; state column ecx: bytes 0,1,2,3 -> out2,out1,out0,out3
+    movzx   esi, cl
+    movzx   edi, ch
+    shr     ecx, 16
+    xor     %3d, t_ref(0,rsi)
+    xor     %2d, t_ref(1,rdi)
+    movzx   esi, cl
+    movzx   edi, ch
+    xor     %1d, t_ref(2,rsi)
+    xor     %4d, t_ref(3,rdi)
+    ; state column edx: bytes 0,1,2,3 -> out3,out2,out1,out0
+    movzx   esi, dl
+    movzx   edi, dh
+    shr     edx, 16
+    xor     %4d, t_ref(0,rsi)
+    xor     %3d, t_ref(1,rdi)
+    movzx   esi, dl
+    movzx   edi, dh
+    xor     %2d, t_ref(2,rsi)
+    xor     %1d, t_ref(3,rdi)
+    ; copy the new state back to eax..edx for the next round (rsi and rdi are left clobbered)
+    mov     eax,%1d
+    mov     ebx,%2d
+    mov     ecx,%3d
+    mov     edx,%4d
+%endmacro
+
+%ifdef LAST_ROUND_TABLES
+; Table variant: same byte routing as ff_rnd, but reading the table that follows the round table.
+%macro fl_rnd 5                 ; last forward round
+    add     tptr, 2048              ; skip the 256 x 8-byte round table to reach the last-round table
+    mov     %1d, fk_ref(%5,0)
+    mov     %2d, fk_ref(%5,1)
+    mov     %3d, fk_ref(%5,2)
+    mov     %4d, fk_ref(%5,3)
+    ; state column eax: bytes 0,1,2,3 -> out0,out3,out2,out1 (out0..out3 = params 1-4)
+    movzx   esi, al
+    movzx   edi, ah
+    shr     eax, 16
+    xor     %1d, t_ref(0,rsi)
+    xor     %4d, t_ref(1,rdi)
+    movzx   esi, al
+    movzx   edi, ah
+    xor     %3d, t_ref(2,rsi)
+    xor     %2d, t_ref(3,rdi)
+    ; state column ebx: bytes 0,1,2,3 -> out1,out0,out3,out2
+    movzx   esi, bl
+    movzx   edi, bh
+    shr     ebx, 16
+    xor     %2d, t_ref(0,rsi)
+    xor     %1d, t_ref(1,rdi)
+    movzx   esi, bl
+    movzx   edi, bh
+    xor     %4d, t_ref(2,rsi)
+    xor     %3d, t_ref(3,rdi)
+    ; state column ecx: bytes 0,1,2,3 -> out2,out1,out0,out3
+    movzx   esi, cl
+    movzx   edi, ch
+    shr     ecx, 16
+    xor     %3d, t_ref(0,rsi)
+    xor     %2d, t_ref(1,rdi)
+    movzx   esi, cl
+    movzx   edi, ch
+    xor     %1d, t_ref(2,rsi)
+    xor     %4d, t_ref(3,rdi)
+    ; state column edx: bytes 0,1,2,3 -> out3,out2,out1,out0; result stays in out0..out3 (no copy back)
+    movzx   esi, dl
+    movzx   edi, dh
+    shr     edx, 16
+    xor     %4d, t_ref(0,rsi)
+    xor     %3d, t_ref(1,rdi)
+    movzx   esi, dl
+    movzx   edi, dh
+    xor     %2d, t_ref(2,rsi)
+    xor     %1d, t_ref(3,rdi)
+%endmacro
+
+%else
+; Byte variant: fetch the plain S-box byte (tab_f) and rotate it into its lane by hand.
+%macro fl_rnd 5                 ; last forward round
+    mov     %1d, fk_ref(%5,0)
+    mov     %2d, fk_ref(%5,1)
+    mov     %3d, fk_ref(%5,2)
+    mov     %4d, fk_ref(%5,3)
+    ; state column eax: bytes 0,1,2,3 -> out0,out3,out2,out1, rotated left by 0,8,16,24 bits
+    movzx   esi, al
+    movzx   edi, ah
+    shr     eax, 16
+    movzx   esi, t_ref(f,rsi)
+    movzx   edi, t_ref(f,rdi)
+    xor     %1d, esi
+    rol     edi, 8
+    xor     %4d, edi
+    movzx   esi, al
+    movzx   edi, ah
+    movzx   esi, t_ref(f,rsi)
+    movzx   edi, t_ref(f,rdi)
+    rol     esi, 16
+    rol     edi, 24
+    xor     %3d, esi
+    xor     %2d, edi
+    ; state column ebx: bytes 0,1,2,3 -> out1,out0,out3,out2
+    movzx   esi, bl
+    movzx   edi, bh
+    shr     ebx, 16
+    movzx   esi, t_ref(f,rsi)
+    movzx   edi, t_ref(f,rdi)
+    xor     %2d, esi
+    rol     edi, 8
+    xor     %1d, edi
+    movzx   esi, bl
+    movzx   edi, bh
+    movzx   esi, t_ref(f,rsi)
+    movzx   edi, t_ref(f,rdi)
+    rol     esi, 16
+    rol     edi, 24
+    xor     %4d, esi
+    xor     %3d, edi
+    ; state column ecx: bytes 0,1,2,3 -> out2,out1,out0,out3
+    movzx   esi, cl
+    movzx   edi, ch
+    movzx   esi, t_ref(f,rsi)
+    movzx   edi, t_ref(f,rdi)
+    shr     ecx, 16
+    xor     %3d, esi
+    rol     edi, 8
+    xor     %2d, edi
+    movzx   esi, cl
+    movzx   edi, ch
+    movzx   esi, t_ref(f,rsi)
+    movzx   edi, t_ref(f,rdi)
+    rol     esi, 16
+    rol     edi, 24
+    xor     %1d, esi
+    xor     %4d, edi
+    ; state column edx: bytes 0,1,2,3 -> out3,out2,out1,out0; result stays in out0..out3 (no copy back)
+    movzx   esi, dl
+    movzx   edi, dh
+    movzx   esi, t_ref(f,rsi)
+    movzx   edi, t_ref(f,rdi)
+    shr     edx, 16
+    xor     %4d, esi
+    rol     edi, 8
+    xor     %3d, edi
+    movzx   esi, dl
+    movzx   edi, dh
+    movzx   esi, t_ref(f,rsi)
+    movzx   edi, t_ref(f,rdi)
+    rol     esi, 16
+    rol     edi, 24
+    xor     %2d, esi
+    xor     %1d, edi
+%endmacro
+
+%endif
+
+%macro ii_rnd 5                 ; normal inverse round: params 1-4 = 64-bit work registers (out0..out3), param 5 = round index for ik_ref
+    mov     %1d, ik_ref(%5,0)
+    mov     %2d, ik_ref(%5,1)
+    mov     %3d, ik_ref(%5,2)
+    mov     %4d, ik_ref(%5,3)
+    ; out0..out3 now hold the round key; state column eax: bytes 0,1,2,3 -> out0,out1,out2,out3
+    movzx   esi, al
+    movzx   edi, ah
+    shr     eax, 16
+    xor     %1d, t_ref(0,rsi)
+    xor     %2d, t_ref(1,rdi)
+    movzx   esi, al
+    movzx   edi, ah
+    xor     %3d, t_ref(2,rsi)
+    xor     %4d, t_ref(3,rdi)
+    ; state column ebx: bytes 0,1,2,3 -> out1,out2,out3,out0
+    movzx   esi, bl
+    movzx   edi, bh
+    shr     ebx, 16
+    xor     %2d, t_ref(0,rsi)
+    xor     %3d, t_ref(1,rdi)
+    movzx   esi, bl
+    movzx   edi, bh
+    xor     %4d, t_ref(2,rsi)
+    xor     %1d, t_ref(3,rdi)
+    ; state column ecx: bytes 0,1,2,3 -> out2,out3,out0,out1
+    movzx   esi, cl
+    movzx   edi, ch
+    shr     ecx, 16
+    xor     %3d, t_ref(0,rsi)
+    xor     %4d, t_ref(1,rdi)
+    movzx   esi, cl
+    movzx   edi, ch
+    xor     %1d, t_ref(2,rsi)
+    xor     %2d, t_ref(3,rdi)
+    ; state column edx: bytes 0,1,2,3 -> out3,out0,out1,out2
+    movzx   esi, dl
+    movzx   edi, dh
+    shr     edx, 16
+    xor     %4d, t_ref(0,rsi)
+    xor     %1d, t_ref(1,rdi)
+    movzx   esi, dl
+    movzx   edi, dh
+    xor     %2d, t_ref(2,rsi)
+    xor     %3d, t_ref(3,rdi)
+    ; copy the new state back to eax..edx for the next round (rsi and rdi are left clobbered)
+    mov     eax,%1d
+    mov     ebx,%2d
+    mov     ecx,%3d
+    mov     edx,%4d
+%endmacro
+
+%ifdef LAST_ROUND_TABLES
+; Table variant: same byte routing as ii_rnd, but reading the table that follows the round table.
+%macro il_rnd 5                 ; last inverse round
+    add     tptr, 2048              ; skip the 256 x 8-byte round table to reach the last-round table
+    mov     %1d, ik_ref(%5,0)
+    mov     %2d, ik_ref(%5,1)
+    mov     %3d, ik_ref(%5,2)
+    mov     %4d, ik_ref(%5,3)
+    ; state column eax: bytes 0,1,2,3 -> out0,out1,out2,out3 (out0..out3 = params 1-4)
+    movzx   esi, al
+    movzx   edi, ah
+    shr     eax, 16
+    xor     %1d, t_ref(0,rsi)
+    xor     %2d, t_ref(1,rdi)
+    movzx   esi, al
+    movzx   edi, ah
+    xor     %3d, t_ref(2,rsi)
+    xor     %4d, t_ref(3,rdi)
+    ; state column ebx: bytes 0,1,2,3 -> out1,out2,out3,out0
+    movzx   esi, bl
+    movzx   edi, bh
+    shr     ebx, 16
+    xor     %2d, t_ref(0,rsi)
+    xor     %3d, t_ref(1,rdi)
+    movzx   esi, bl
+    movzx   edi, bh
+    xor     %4d, t_ref(2,rsi)
+    xor     %1d, t_ref(3,rdi)
+    ; state column ecx: bytes 0,1,2,3 -> out2,out3,out0,out1
+    movzx   esi, cl
+    movzx   edi, ch
+    shr     ecx, 16
+    xor     %3d, t_ref(0,rsi)
+    xor     %4d, t_ref(1,rdi)
+    movzx   esi, cl
+    movzx   edi, ch
+    xor     %1d, t_ref(2,rsi)
+    xor     %2d, t_ref(3,rdi)
+    ; state column edx: bytes 0,1,2,3 -> out3,out0,out1,out2; result stays in out0..out3 (no copy back)
+    movzx   esi, dl
+    movzx   edi, dh
+    shr     edx, 16
+    xor     %4d, t_ref(0,rsi)
+    xor     %1d, t_ref(1,rdi)
+    movzx   esi, dl
+    movzx   edi, dh
+    xor     %2d, t_ref(2,rsi)
+    xor     %3d, t_ref(3,rdi)
+%endmacro
+
+%else
+; Byte variant: fetch the plain inverse S-box byte (tab_i) and rotate it into its lane by hand.
+%macro il_rnd 5                 ; last inverse round
+    mov     %1d, ik_ref(%5,0)
+    mov     %2d, ik_ref(%5,1)
+    mov     %3d, ik_ref(%5,2)
+    mov     %4d, ik_ref(%5,3)
+    ; state column eax: bytes 0,1,2,3 -> out0,out1,out2,out3, rotated left by 0,8,16,24 bits
+    movzx   esi, al
+    movzx   edi, ah
+    movzx   esi, t_ref(i,rsi)
+    movzx   edi, t_ref(i,rdi)
+    shr     eax, 16
+    xor     %1d, esi
+    rol     edi, 8
+    xor     %2d, edi
+    movzx   esi, al
+    movzx   edi, ah
+    movzx   esi, t_ref(i,rsi)
+    movzx   edi, t_ref(i,rdi)
+    rol     esi, 16
+    rol     edi, 24
+    xor     %3d, esi
+    xor     %4d, edi
+    ; state column ebx: bytes 0,1,2,3 -> out1,out2,out3,out0
+    movzx   esi, bl
+    movzx   edi, bh
+    movzx   esi, t_ref(i,rsi)
+    movzx   edi, t_ref(i,rdi)
+    shr     ebx, 16
+    xor     %2d, esi
+    rol     edi, 8
+    xor     %3d, edi
+    movzx   esi, bl
+    movzx   edi, bh
+    movzx   esi, t_ref(i,rsi)
+    movzx   edi, t_ref(i,rdi)
+    rol     esi, 16
+    rol     edi, 24
+    xor     %4d, esi
+    xor     %1d, edi
+    ; state column ecx: bytes 0,1,2,3 -> out2,out3,out0,out1
+    movzx   esi, cl
+    movzx   edi, ch
+    movzx   esi, t_ref(i,rsi)
+    movzx   edi, t_ref(i,rdi)
+    shr     ecx, 16
+    xor     %3d, esi
+    rol     edi, 8
+    xor     %4d, edi
+    movzx   esi, cl
+    movzx   edi, ch
+    movzx   esi, t_ref(i,rsi)
+    movzx   edi, t_ref(i,rdi)
+    rol     esi, 16
+    rol     edi, 24
+    xor     %1d, esi
+    xor     %2d, edi
+    ; state column edx: bytes 0,1,2,3 -> out3,out0,out1,out2; result stays in out0..out3 (no copy back)
+    movzx   esi, dl
+    movzx   edi, dh
+    movzx   esi, t_ref(i,rsi)
+    movzx   edi, t_ref(i,rdi)
+    shr     edx, 16
+    xor     %4d, esi
+    rol     edi, 8
+    xor     %1d, edi
+    movzx   esi, dl
+    movzx   edi, dh
+    movzx   esi, t_ref(i,rsi)
+    movzx   edi, t_ref(i,rdi)
+    rol     esi, 16
+    rol     edi, 24
+    xor     %2d, esi
+    xor     %3d, edi
+%endmacro
+
+%endif
+
+%ifdef ENCRYPTION
+; aes_encrypt(in_blk, out_blk, cx): encrypts one 16-byte block. rax = 0 on success, -1 if cx holds no valid round count.
+    global  aes_encrypt
+%ifdef DLL_EXPORT
+    export  aes_encrypt
+%endif
+; enc_tab: 256 u8 entries of 8 bytes, followed (with LAST_ROUND_TABLES) by 256 w8 entries.
+    section .data align=64
+    align   64
+enc_tab:
+    enc_vals u8
+%ifdef LAST_ROUND_TABLES
+    enc_vals w8
+%endif
+
+    section .text align=16
+    align   16
+; Prologue (three ABI variants). All leave: input pointer in rdi, output pointer at [rsp], context in r8.
+%ifdef _SEH_
+proc_frame aes_encrypt
+	alloc_stack	7*8			; 7 to align stack to 16 bytes
+	save_reg	rsi,4*8
+	save_reg	rdi,5*8
+	save_reg	rbx,1*8
+	save_reg	rbp,2*8
+	save_reg	r12,3*8
+end_prologue
+    mov     rdi, rcx        ; input pointer
+    mov     [rsp+0*8], rdx  ; output pointer
+%else
+	aes_encrypt:
+	%ifdef __GNUC__
+		sub     rsp, 4*8        ; gnu/linux binary interface
+		mov     [rsp+0*8], rsi  ; output pointer
+		mov     r8, rdx         ; context
+	%else
+		sub     rsp, 6*8        ; windows binary interface
+		mov     [rsp+4*8], rsi
+		mov     [rsp+5*8], rdi
+		mov     rdi, rcx        ; input pointer
+		mov     [rsp+0*8], rdx  ; output pointer
+	%endif
+		mov     [rsp+1*8], rbx  ; input pointer in rdi
+		mov     [rsp+2*8], rbp  ; output pointer in [rsp]
+		mov     [rsp+3*8], r12  ; context in r8
+%endif
+    ; esi = byte stored just past the KS_LENGTH-word key schedule; tested below against 16 * number of rounds
+    movzx   esi, byte [kptr+4*KS_LENGTH]
+    lea     tptr, [rel enc_tab]
+    sub     kptr, fofs
+    ; load the 16-byte input block as four 32-bit state words
+    mov     eax, [rdi+0*4]
+    mov     ebx, [rdi+1*4]
+    mov     ecx, [rdi+2*4]
+    mov     edx, [rdi+3*4]
+    ; XOR in the first round key (the first 16 bytes of the schedule)
+    xor     eax, [kptr+fofs]
+    xor     ebx, [kptr+fofs+4]
+    xor     ecx, [kptr+fofs+8]
+    xor     edx, [kptr+fofs+12]
+    ; advance kptr by 16*N so that fk_ref(r,..) addresses round key N-r, then dispatch on N
+    lea     kptr,[kptr+rsi]
+    cmp     esi, 10*16
+    je      .3
+    cmp     esi, 12*16
+    je      .2
+    cmp     esi, 14*16
+    je      .1
+    mov     rax, -1             ; unsupported round count: fail without writing the output block
+    jmp     .4
+    ; unrolled rounds with fall-through entry points: .1 = 14 rounds, .2 = 12 rounds, .3 = 10 rounds
+.1: ff_rnd  r9, r10, r11, r12, 13
+    ff_rnd  r9, r10, r11, r12, 12
+.2: ff_rnd  r9, r10, r11, r12, 11
+    ff_rnd  r9, r10, r11, r12, 10
+.3: ff_rnd  r9, r10, r11, r12, 9
+    ff_rnd  r9, r10, r11, r12, 8
+    ff_rnd  r9, r10, r11, r12, 7
+    ff_rnd  r9, r10, r11, r12, 6
+    ff_rnd  r9, r10, r11, r12, 5
+    ff_rnd  r9, r10, r11, r12, 4
+    ff_rnd  r9, r10, r11, r12, 3
+    ff_rnd  r9, r10, r11, r12, 2
+    ff_rnd  r9, r10, r11, r12, 1
+    fl_rnd  r9, r10, r11, r12, 0
+    ; the last round leaves the result in r9d..r12d: store it via the saved output pointer, return 0
+    mov     rbx, [rsp]
+    mov     [rbx], r9d
+    mov     [rbx+4], r10d
+    mov     [rbx+8], r11d
+    mov     [rbx+12], r12d
+    xor     rax, rax
+.4:
+    mov     rbx, [rsp+1*8]
+    mov     rbp, [rsp+2*8]
+    mov     r12, [rsp+3*8]
+%ifdef __GNUC__
+    add     rsp, 4*8
+    ret
+%else
+		mov     rsi, [rsp+4*8]
+		mov     rdi, [rsp+5*8]
+	%ifdef _SEH_
+		add     rsp, 7*8
+		ret
+	endproc_frame
+	%else
+		add     rsp, 6*8
+		ret
+	%endif
+%endif
+
+%endif
+
+%ifdef DECRYPTION
+; aes_decrypt(in_blk, out_blk, cx): decrypts one 16-byte block. rax = 0 on success, -1 if cx holds no valid round count.
+    global  aes_decrypt
+%ifdef DLL_EXPORT
+    export  aes_decrypt
+%endif
+; NOTE(review): this section line has no align=64, unlike the one for enc_tab - confirm 'align 64' alone is honoured.
+    section .data
+    align   64
+dec_tab:
+    dec_vals v8
+%ifdef LAST_ROUND_TABLES
+    dec_vals w8
+%endif
+
+    section .text
+    align   16
+; Prologue (three ABI variants). All leave: input pointer in rdi, output pointer at [rsp], context in r8.
+%ifdef _SEH_
+proc_frame aes_decrypt
+	alloc_stack	7*8			; 7 to align stack to 16 bytes
+	save_reg	rsi,4*8
+	save_reg	rdi,5*8
+	save_reg	rbx,1*8
+	save_reg	rbp,2*8
+	save_reg	r12,3*8
+end_prologue
+    mov     rdi, rcx        ; input pointer
+    mov     [rsp+0*8], rdx  ; output pointer
+%else
+	aes_decrypt:
+	%ifdef __GNUC__
+		sub     rsp, 4*8        ; gnu/linux binary interface
+		mov     [rsp+0*8], rsi  ; output pointer
+		mov     r8, rdx         ; context
+	%else
+		sub     rsp, 6*8        ; windows binary interface
+		mov     [rsp+4*8], rsi
+		mov     [rsp+5*8], rdi
+		mov     rdi, rcx        ; input pointer
+		mov     [rsp+0*8], rdx  ; output pointer
+	%endif
+		mov     [rsp+1*8], rbx  ; input pointer in rdi
+		mov     [rsp+2*8], rbp  ; output pointer in [rsp]
+		mov     [rsp+3*8], r12  ; context in r8
+%endif
+    ; esi = byte stored just past the KS_LENGTH-word key schedule; tested below against 16 * number of rounds
+    movzx   esi,byte[kptr+4*KS_LENGTH]
+    lea     tptr, [rel dec_tab]
+    sub     kptr, rofs
+    ; load the 16-byte input block as four 32-bit state words (rdi is reused as a key pointer next)
+    mov     eax, [rdi+0*4]
+    mov     ebx, [rdi+1*4]
+    mov     ecx, [rdi+2*4]
+    mov     edx, [rdi+3*4]
+    ; rdi -> first key to apply: start of a reversed schedule, otherwise the key 16*N bytes in
+%ifdef      AES_REV_DKS
+    mov     rdi, kptr
+    lea     kptr,[kptr+rsi]
+%else
+    lea     rdi,[kptr+rsi]
+%endif
+    ; XOR in the first decryption round key
+    xor     eax, [rdi+rofs]
+    xor     ebx, [rdi+rofs+4]
+    xor     ecx, [rdi+rofs+8]
+    xor     edx, [rdi+rofs+12]
+    ; dispatch on the round count held in esi (16 * N)
+    cmp     esi, 10*16
+    je      .3
+    cmp     esi, 12*16
+    je      .2
+    cmp     esi, 14*16
+    je      .1
+    mov     rax, -1             ; unsupported round count: fail without writing the output block
+    jmp     .4
+    ; unrolled rounds with fall-through entry points: .1 = 14 rounds, .2 = 12 rounds, .3 = 10 rounds
+.1: ii_rnd  r9, r10, r11, r12, 13
+    ii_rnd  r9, r10, r11, r12, 12
+.2: ii_rnd  r9, r10, r11, r12, 11
+    ii_rnd  r9, r10, r11, r12, 10
+.3: ii_rnd  r9, r10, r11, r12, 9
+    ii_rnd  r9, r10, r11, r12, 8
+    ii_rnd  r9, r10, r11, r12, 7
+    ii_rnd  r9, r10, r11, r12, 6
+    ii_rnd  r9, r10, r11, r12, 5
+    ii_rnd  r9, r10, r11, r12, 4
+    ii_rnd  r9, r10, r11, r12, 3
+    ii_rnd  r9, r10, r11, r12, 2
+    ii_rnd  r9, r10, r11, r12, 1
+    il_rnd  r9, r10, r11, r12, 0
+    ; the last round leaves the result in r9d..r12d: store it via the saved output pointer, return 0
+    mov     rbx, [rsp]
+    mov     [rbx], r9d
+    mov     [rbx+4], r10d
+    mov     [rbx+8], r11d
+    mov     [rbx+12], r12d
+    xor     rax, rax
+.4: mov     rbx, [rsp+1*8]
+    mov     rbp, [rsp+2*8]
+    mov     r12, [rsp+3*8]
+%ifdef __GNUC__
+    add     rsp, 4*8
+    ret
+%else
+		mov     rsi, [rsp+4*8]
+		mov     rdi, [rsp+5*8]
+	%ifdef _SEH_
+		add     rsp, 7*8
+		ret
+	endproc_frame
+	%else
+		add     rsp, 6*8
+		ret
+	%endif
+%endif
+
+%endif
diff --git a/src/Crypto/Aes_x86.asm b/src/Crypto/Aes_x86.asm
index 239da3e3..3825deee 100644
--- a/src/Crypto/Aes_x86.asm
+++ b/src/Crypto/Aes_x86.asm
@@ -1,646 +1,646 @@
-
-; ---------------------------------------------------------------------------
-; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-; 
-; LICENSE TERMS
-; 
-; The free distribution and use of this software is allowed (with or without
-; changes) provided that:
-; 
-;  1. source code distributions include the above copyright notice, this
-;     list of conditions and the following disclaimer;
-; 
-;  2. binary distributions include the above copyright notice, this list
-;     of conditions and the following disclaimer in their documentation;
-; 
-;  3. the name of the copyright holder is not used to endorse products
-;     built using this software without specific written permission.
-; 
-; DISCLAIMER
-; 
-; This software is provided 'as is' with no explicit or implied warranties
-; in respect of its properties, including, but not limited to, correctness
-; and/or fitness for purpose.
-; ---------------------------------------------------------------------------
-; Issue 20/12/2007
-;
-; This code requires ASM_X86_V1C to be set in aesopt.h. It requires the C files
-; aeskey.c and aestab.c for support.
-
-;
-; Adapted for TrueCrypt:
-; - Compatibility with NASM and GCC
-;
-
-; An AES implementation for x86 processors using the YASM (or NASM) assembler.
-; This is an assembler implementation that covers encryption and decryption
-; only and is intended as a replacement of the C file aescrypt.c. It hence
-; requires the file aeskey.c for keying and aestab.c for the AES tables. It
-; employs full tables rather than compressed tables.
-
-; This code provides the standard AES block size (128 bits, 16 bytes) and the
-; three standard AES key sizes (128, 192 and 256 bits). It has the same call
-; interface as my C implementation. The ebx, esi, edi and ebp registers are
-; preserved across calls but eax, ecx and edx and the artihmetic status flags
-; are not.  It is also important that the defines below match those used in the
-; C code.  This code uses the VC++ register saving conentions; if it is used
-; with another compiler, conventions for using and saving registers may need to
-; be checked (and calling conventions).  The YASM command line for the VC++
-; custom build step is:
-;
-;    yasm -Xvc -f win32 -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"
-;
-;  The calling intefaces are:
-;
-;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
-;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
-;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
-;                                            const aes_encrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
-;                                            const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_encrypt_key(const unsigned char key[],
-;                           unsigned int len, const aes_decrypt_ctx cx[1]);
-;
-;     AES_RETURN aes_decrypt_key(const unsigned char key[],
-;                           unsigned int len, const aes_decrypt_ctx cx[1]);
-;
-; where <NNN> is 128, 102 or 256.  In the last two calls the length can be in
-; either bits or bytes.
-;
-; Comment in/out the following lines to obtain the desired subroutines. These
-; selections MUST match those in the C header file aes.h
-
-; %define AES_128                 ; define if AES with 128 bit keys is needed
-; %define AES_192                 ; define if AES with 192 bit keys is needed
-%define AES_256                 ; define if AES with 256 bit keys is needed
-; %define AES_VAR                 ; define if a variable key size is needed
-%define ENCRYPTION              ; define if encryption is needed
-%define DECRYPTION              ; define if decryption is needed
-%define AES_REV_DKS             ; define if key decryption schedule is reversed
-%define LAST_ROUND_TABLES       ; define if tables are to be used for last round
-
-; offsets to parameters
-
-in_blk  equ     4   ; input byte array address parameter
-out_blk equ     8   ; output byte array address parameter
-ctx     equ    12   ; AES context structure
-stk_spc equ    20   ; stack space
-%define parms  12   ; parameter space on stack
-
-; The encryption key schedule has the following in memory layout where N is the
-; number of rounds (10, 12 or 14):
-;
-; lo: | input key (round 0)  |  ; each round is four 32-bit words
-;     | encryption round 1   |
-;     | encryption round 2   |
-;     ....
-;     | encryption round N-1 |
-; hi: | encryption round N   |
-;
-; The decryption key schedule is normally set up so that it has the same
-; layout as above by actually reversing the order of the encryption key
-; schedule in memory (this happens when AES_REV_DKS is set):
-;
-; lo: | decryption round 0   | =              | encryption round N   |
-;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
-;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
-;     ....                       ....
-;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
-; hi: | decryption round N   | =              | input key (round 0)  |
-;
-; with rounds except the first and last modified using inv_mix_column()
-; But if AES_REV_DKS is NOT set the order of keys is left as it is for
-; encryption so that it has to be accessed in reverse when used for
-; decryption (although the inverse mix column modifications are done)
-;
-; lo: | decryption round 0   | =              | input key (round 0)  |
-;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
-;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
-;     ....                       ....
-;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
-; hi: | decryption round N   | =              | encryption round N   |
-;
-; This layout is faster when the assembler key scheduling provided here
-; is used.
-;
-; The DLL interface must use the _stdcall convention in which the number
-; of bytes of parameter space is added after an @ to the sutine's name.
-; We must also remove our parameters from the stack before return (see
-; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
-
-;%define DLL_EXPORT
-
-; End of user defines
-
-%ifdef AES_VAR
-%ifndef AES_128
-%define AES_128
-%endif
-%ifndef AES_192
-%define AES_192
-%endif
-%ifndef AES_256
-%define AES_256
-%endif
-%endif
-
-%ifdef AES_VAR
-%define KS_LENGTH       60
-%elifdef AES_256
-%define KS_LENGTH       60
-%elifdef AES_192
-%define KS_LENGTH       52
-%else
-%define KS_LENGTH       44
-%endif
-
-; These macros implement stack based local variables
-
-%macro  save 2
-    mov     [esp+4*%1],%2
-%endmacro
-
-%macro  restore 2
-    mov     %1,[esp+4*%2]
-%endmacro
-
-; the DLL has to implement the _stdcall calling interface on return
-; In this case we have to take our parameters (3 4-byte pointers)
-; off the stack
-
-%macro  do_name 1-2 parms
-%ifndef DLL_EXPORT
-    align 32
-    global  %1
-%1:
-%else
-    align 32
-    global  %1@%2
-    export  _%1@%2
-%1@%2:
-%endif
-%endmacro
-
-%macro  do_call 1-2 parms
-%ifndef DLL_EXPORT
-    call    %1
-    add     esp,%2
-%else
-    call    %1@%2
-%endif
-%endmacro
-
-%macro  do_exit  0-1 parms
-%ifdef DLL_EXPORT
-    ret %1
-%else
-    ret
-%endif
-%endmacro
-
-%ifdef  ENCRYPTION
-
-    extern  t_fn
-
-%define etab_0(x)   [t_fn+4*x]
-%define etab_1(x)   [t_fn+1024+4*x]
-%define etab_2(x)   [t_fn+2048+4*x]
-%define etab_3(x)   [t_fn+3072+4*x]
-
-%ifdef LAST_ROUND_TABLES
-
-    extern  t_fl
-
-%define eltab_0(x)  [t_fl+4*x]
-%define eltab_1(x)  [t_fl+1024+4*x]
-%define eltab_2(x)  [t_fl+2048+4*x]
-%define eltab_3(x)  [t_fl+3072+4*x]
-
-%else
-
-%define etab_b(x)   byte [t_fn+3072+4*x]
-
-%endif
-
-; ROUND FUNCTION.  Build column[2] on ESI and column[3] on EDI that have the
-; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
-;
-; Input:
-;
-;   EAX     column[0]
-;   EBX     column[1]
-;   ECX     column[2]
-;   EDX     column[3]
-;   ESI     column key[round][2]
-;   EDI     column key[round][3]
-;   EBP     scratch
-;
-; Output:
-;
-;   EBP     column[0]   unkeyed
-;   EBX     column[1]   unkeyed
-;   ESI     column[2]   keyed
-;   EDI     column[3]   keyed
-;   EAX     scratch
-;   ECX     scratch
-;   EDX     scratch
-
-%macro rnd_fun 2
-
-    rol     ebx,16
-    %1      esi, cl, 0, ebp
-    %1      esi, dh, 1, ebp
-    %1      esi, bh, 3, ebp
-    %1      edi, dl, 0, ebp
-    %1      edi, ah, 1, ebp
-    %1      edi, bl, 2, ebp
-    %2      ebp, al, 0, ebp
-    shr     ebx,16
-    and     eax,0xffff0000
-    or      eax,ebx
-    shr     edx,16
-    %1      ebp, ah, 1, ebx
-    %1      ebp, dh, 3, ebx
-    %2      ebx, dl, 2, ebx
-    %1      ebx, ch, 1, edx
-    %1      ebx, al, 0, edx
-    shr     eax,16
-    shr     ecx,16
-    %1      ebp, cl, 2, edx
-    %1      edi, ch, 3, edx
-    %1      esi, al, 2, edx
-    %1      ebx, ah, 3, edx
-
-%endmacro
-
-; Basic MOV and XOR Operations for normal rounds
-
-%macro  nr_xor  4
-    movzx   %4,%2
-    xor     %1,etab_%3(%4)
-%endmacro
-
-%macro  nr_mov  4
-    movzx   %4,%2
-    mov     %1,etab_%3(%4)
-%endmacro
-
-; Basic MOV and XOR Operations for last round
-
-%ifdef LAST_ROUND_TABLES
-
-    %macro  lr_xor  4
-        movzx   %4,%2
-        xor     %1,eltab_%3(%4)
-    %endmacro
-
-    %macro  lr_mov  4
-        movzx   %4,%2
-        mov     %1,eltab_%3(%4)
-    %endmacro
-
-%else
-
-    %macro  lr_xor  4
-        movzx   %4,%2
-        movzx   %4,etab_b(%4)
-    %if %3 != 0
-        shl     %4,8*%3
-    %endif
-        xor     %1,%4
-    %endmacro
-
-    %macro  lr_mov  4
-        movzx   %4,%2
-        movzx   %1,etab_b(%4)
-    %if %3 != 0
-        shl     %1,8*%3
-    %endif
-    %endmacro
-
-%endif
-
-%macro enc_round 0
-
-    add     ebp,16
-    save    0,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    rnd_fun nr_xor, nr_mov
-
-    mov     eax,ebp
-    mov     ecx,esi
-    mov     edx,edi
-    restore ebp,0
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-
-%endmacro
-
-%macro enc_last_round 0
-
-    add     ebp,16
-    save    0,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    rnd_fun lr_xor, lr_mov
-
-    mov     eax,ebp
-    restore ebp,0
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-
-%endmacro
-
-    section .text align=32
-
-; AES Encryption Subroutine
-
-    do_name aes_encrypt
-
-    sub     esp,stk_spc
-    mov     [esp+16],ebp
-    mov     [esp+12],ebx
-    mov     [esp+ 8],esi
-    mov     [esp+ 4],edi
-
-    mov     esi,[esp+in_blk+stk_spc] ; input pointer
-    mov     eax,[esi   ]
-    mov     ebx,[esi+ 4]
-    mov     ecx,[esi+ 8]
-    mov     edx,[esi+12]
-
-    mov     ebp,[esp+ctx+stk_spc]    ; key pointer
-    movzx   edi,byte [ebp+4*KS_LENGTH]
-    xor     eax,[ebp   ]
-    xor     ebx,[ebp+ 4]
-    xor     ecx,[ebp+ 8]
-    xor     edx,[ebp+12]
-
-; determine the number of rounds
-
-    cmp     edi,10*16
-    je      .3
-    cmp     edi,12*16
-    je      .2
-    cmp     edi,14*16
-    je      .1
-    mov     eax,-1
-    jmp     .5
-
-.1: enc_round
-    enc_round
-.2: enc_round
-    enc_round
-.3: enc_round
-    enc_round
-    enc_round
-    enc_round
-    enc_round
-    enc_round
-    enc_round
-    enc_round
-    enc_round
-    enc_last_round
-
-    mov     edx,[esp+out_blk+stk_spc]
-    mov     [edx],eax
-    mov     [edx+4],ebx
-    mov     [edx+8],esi
-    mov     [edx+12],edi
-    xor     eax,eax
-
-.5: mov     ebp,[esp+16]
-    mov     ebx,[esp+12]
-    mov     esi,[esp+ 8]
-    mov     edi,[esp+ 4]
-    add     esp,stk_spc
-    do_exit
-
-%endif
-
-%ifdef  DECRYPTION
-
-    extern  t_in
-
-%define dtab_0(x)   [t_in+4*x]
-%define dtab_1(x)   [t_in+1024+4*x]
-%define dtab_2(x)   [t_in+2048+4*x]
-%define dtab_3(x)   [t_in+3072+4*x]
-
-%ifdef LAST_ROUND_TABLES
-
-    extern  t_il
-
-%define dltab_0(x)  [t_il+4*x]
-%define dltab_1(x)  [t_il+1024+4*x]
-%define dltab_2(x)  [t_il+2048+4*x]
-%define dltab_3(x)  [t_il+3072+4*x]
-
-%else
-
-    extern  _t_ibox
-
-%define dtab_x(x)   byte [_t_ibox+x]
-
-%endif
-
-%macro irn_fun 2
-
-    rol eax,16
-    %1      esi, cl, 0, ebp
-    %1      esi, bh, 1, ebp
-    %1      esi, al, 2, ebp
-    %1      edi, dl, 0, ebp
-    %1      edi, ch, 1, ebp
-    %1      edi, ah, 3, ebp
-    %2      ebp, bl, 0, ebp
-    shr     eax,16
-    and     ebx,0xffff0000
-    or      ebx,eax
-    shr     ecx,16
-    %1      ebp, bh, 1, eax
-    %1      ebp, ch, 3, eax
-    %2      eax, cl, 2, ecx
-    %1      eax, bl, 0, ecx
-    %1      eax, dh, 1, ecx
-    shr     ebx,16
-    shr     edx,16
-    %1      esi, dh, 3, ecx
-    %1      ebp, dl, 2, ecx
-    %1      eax, bh, 3, ecx
-    %1      edi, bl, 2, ecx
-
-%endmacro
-
-; Basic MOV and XOR Operations for normal rounds
-
-%macro  ni_xor  4
-    movzx   %4,%2
-    xor     %1,dtab_%3(%4)
-%endmacro
-
-%macro  ni_mov  4
-    movzx   %4,%2
-    mov     %1,dtab_%3(%4)
-%endmacro
-
-; Basic MOV and XOR Operations for last round
-
-%ifdef LAST_ROUND_TABLES
-
-%macro  li_xor  4
-    movzx   %4,%2
-    xor     %1,dltab_%3(%4)
-%endmacro
-
-%macro  li_mov  4
-    movzx   %4,%2
-    mov     %1,dltab_%3(%4)
-%endmacro
-
-%else
-
-    %macro  li_xor  4
-        movzx   %4,%2
-        movzx   %4,dtab_x(%4)
-    %if %3 != 0
-        shl     %4,8*%3
-    %endif
-        xor     %1,%4
-    %endmacro
-
-    %macro  li_mov  4
-        movzx   %4,%2
-        movzx   %1,dtab_x(%4)
-    %if %3 != 0
-        shl     %1,8*%3
-    %endif
-    %endmacro
-
-%endif
-
-%macro dec_round 0
-
-%ifdef AES_REV_DKS
-    add     ebp,16
-%else
-    sub     ebp,16
-%endif
-    save    0,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    irn_fun ni_xor, ni_mov
-
-    mov     ebx,ebp
-    mov     ecx,esi
-    mov     edx,edi
-    restore ebp,0
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-
-%endmacro
-
-%macro dec_last_round 0
-
-%ifdef AES_REV_DKS
-    add     ebp,16
-%else
-    sub     ebp,16
-%endif
-    save    0,ebp
-    mov     esi,[ebp+8]
-    mov     edi,[ebp+12]
-
-    irn_fun li_xor, li_mov
-
-    mov     ebx,ebp
-    restore ebp,0
-    xor     eax,[ebp]
-    xor     ebx,[ebp+4]
-
-%endmacro
-
-    section .text
-
-; AES Decryption Subroutine
-
-    do_name aes_decrypt
-
-    sub     esp,stk_spc
-    mov     [esp+16],ebp
-    mov     [esp+12],ebx
-    mov     [esp+ 8],esi
-    mov     [esp+ 4],edi
-
-; input four columns and xor in first round key
-
-    mov     esi,[esp+in_blk+stk_spc] ; input pointer
-    mov     eax,[esi   ]
-    mov     ebx,[esi+ 4]
-    mov     ecx,[esi+ 8]
-    mov     edx,[esi+12]
-    lea     esi,[esi+16]
-
-    mov     ebp,[esp+ctx+stk_spc]    ; key pointer
-    movzx   edi,byte[ebp+4*KS_LENGTH]
-%ifndef  AES_REV_DKS        ; if decryption key schedule is not reversed
-    lea     ebp,[ebp+edi]   ; we have to access it from the top down
-%endif
-    xor     eax,[ebp   ]    ; key schedule
-    xor     ebx,[ebp+ 4]
-    xor     ecx,[ebp+ 8]
-    xor     edx,[ebp+12]
-
-; determine the number of rounds
-
-    cmp     edi,10*16
-    je      .3
-    cmp     edi,12*16
-    je      .2
-    cmp     edi,14*16
-    je      .1
-    mov     eax,-1
-    jmp     .5
-
-.1: dec_round
-    dec_round
-.2: dec_round
-    dec_round
-.3: dec_round
-    dec_round
-    dec_round
-    dec_round
-    dec_round
-    dec_round
-    dec_round
-    dec_round
-    dec_round
-    dec_last_round
-
-; move final values to the output array.
-
-    mov     ebp,[esp+out_blk+stk_spc]
-    mov     [ebp],eax
-    mov     [ebp+4],ebx
-    mov     [ebp+8],esi
-    mov     [ebp+12],edi
-    xor     eax,eax
-
-.5: mov     ebp,[esp+16]
-    mov     ebx,[esp+12]
-    mov     esi,[esp+ 8]
-    mov     edi,[esp+ 4]
-    add     esp,stk_spc
-    do_exit
-
-%endif
+
+; ---------------------------------------------------------------------------
+; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+; 
+; LICENSE TERMS
+; 
+; The free distribution and use of this software is allowed (with or without
+; changes) provided that:
+; 
+;  1. source code distributions include the above copyright notice, this
+;     list of conditions and the following disclaimer;
+; 
+;  2. binary distributions include the above copyright notice, this list
+;     of conditions and the following disclaimer in their documentation;
+; 
+;  3. the name of the copyright holder is not used to endorse products
+;     built using this software without specific written permission.
+; 
+; DISCLAIMER
+; 
+; This software is provided 'as is' with no explicit or implied warranties
+; in respect of its properties, including, but not limited to, correctness
+; and/or fitness for purpose.
+; ---------------------------------------------------------------------------
+; Issue 20/12/2007
+;
+; This code requires ASM_X86_V1C to be set in aesopt.h. It requires the C files
+; aeskey.c and aestab.c for support.
+
+;
+; Adapted for TrueCrypt:
+; - Compatibility with NASM and GCC
+;
+
+; An AES implementation for x86 processors using the YASM (or NASM) assembler.
+; This is an assembler implementation that covers encryption and decryption
+; only and is intended as a replacement of the C file aescrypt.c. It hence
+; requires the file aeskey.c for keying and aestab.c for the AES tables. It
+; employs full tables rather than compressed tables.
+
+; This code provides the standard AES block size (128 bits, 16 bytes) and the
+; three standard AES key sizes (128, 192 and 256 bits). It has the same call
+; interface as my C implementation. The ebx, esi, edi and ebp registers are
+; preserved across calls but eax, ecx and edx and the arithmetic status flags
+; are not.  It is also important that the defines below match those used in the
+; C code.  This code uses the VC++ register saving conventions; if it is used
+; with another compiler, conventions for using and saving registers may need to
+; be checked (and calling conventions).  The YASM command line for the VC++
+; custom build step is:
+;
+;    yasm -Xvc -f win32 -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"
+;
+;  The calling interfaces are:
+;
+;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
+;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
+;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
+;                                            const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
+;                                            const aes_decrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_encrypt_key(const unsigned char key[],
+;                           unsigned int len, const aes_encrypt_ctx cx[1]);
+;
+;     AES_RETURN aes_decrypt_key(const unsigned char key[],
+;                           unsigned int len, const aes_decrypt_ctx cx[1]);
+;
+; where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
+; either bits or bytes.
+;
+; Comment in/out the following lines to obtain the desired subroutines. These
+; selections MUST match those in the C header file aes.h
+
+; %define AES_128                 ; define if AES with 128 bit keys is needed
+; %define AES_192                 ; define if AES with 192 bit keys is needed
+%define AES_256                 ; define if AES with 256 bit keys is needed
+; %define AES_VAR                 ; define if a variable key size is needed
+%define ENCRYPTION              ; define if encryption is needed
+%define DECRYPTION              ; define if decryption is needed
+%define AES_REV_DKS             ; define if key decryption schedule is reversed
+%define LAST_ROUND_TABLES       ; define if tables are to be used for last round
+
+; offsets to parameters
+
+in_blk  equ     4   ; input byte array address parameter
+out_blk equ     8   ; output byte array address parameter
+ctx     equ    12   ; AES context structure
+stk_spc equ    20   ; stack space
+%define parms  12   ; parameter space on stack
+
+; The encryption key schedule has the following in memory layout where N is the
+; number of rounds (10, 12 or 14):
+;
+; lo: | input key (round 0)  |  ; each round is four 32-bit words
+;     | encryption round 1   |
+;     | encryption round 2   |
+;     ....
+;     | encryption round N-1 |
+; hi: | encryption round N   |
+;
+; The decryption key schedule is normally set up so that it has the same
+; layout as above by actually reversing the order of the encryption key
+; schedule in memory (this happens when AES_REV_DKS is set):
+;
+; lo: | decryption round 0   | =              | encryption round N   |
+;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
+;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
+;     ....                       ....
+;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
+; hi: | decryption round N   | =              | input key (round 0)  |
+;
+; with rounds except the first and last modified using inv_mix_column()
+; But if AES_REV_DKS is NOT set the order of keys is left as it is for
+; encryption so that it has to be accessed in reverse when used for
+; decryption (although the inverse mix column modifications are done)
+;
+; lo: | decryption round 0   | =              | input key (round 0)  |
+;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
+;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
+;     ....                       ....
+;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
+; hi: | decryption round N   | =              | encryption round N   |
+;
+; This layout is faster when the assembler key scheduling provided here
+; is used.
+;
+; The DLL interface must use the _stdcall convention in which the number
+; of bytes of parameter space is added after an @ to the subroutine's name.
+; We must also remove our parameters from the stack before return (see
+; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
+
+;%define DLL_EXPORT
+
+; End of user defines
+
+%ifdef AES_VAR
+%ifndef AES_128
+%define AES_128
+%endif
+%ifndef AES_192
+%define AES_192
+%endif
+%ifndef AES_256
+%define AES_256
+%endif
+%endif
+
+%ifdef AES_VAR
+%define KS_LENGTH       60
+%elifdef AES_256
+%define KS_LENGTH       60
+%elifdef AES_192
+%define KS_LENGTH       52
+%else
+%define KS_LENGTH       44
+%endif
+
+; These macros implement stack based local variables
+
+%macro  save 2
+    mov     [esp+4*%1],%2       ; store register %2 in local dword slot %1 (address esp+4*%1)
+%endmacro
+
+%macro  restore 2
+    mov     %1,[esp+4*%2]       ; reload register %1 from local dword slot %2 (address esp+4*%2)
+%endmacro
+
+; the DLL has to implement the _stdcall calling interface on return
+; In this case we have to take our parameters (3 4-byte pointers)
+; off the stack
+
+%macro  do_name 1-2 parms
+%ifndef DLL_EXPORT
+    align 32
+    global  %1                  ; non-DLL build: publish the bare symbol name %1
+%1:
+%else
+    align 32
+    global  %1@%2               ; DLL build: symbol is suffixed with @<parameter bytes> (%2 defaults to parms)
+    export  _%1@%2
+%1@%2:
+%endif
+%endmacro
+
+%macro  do_call 1-2 parms
+%ifndef DLL_EXPORT
+    call    %1
+    add     esp,%2              ; non-DLL build: caller drops the %2 parameter bytes after the call
+%else
+    call    %1@%2               ; DLL build: call the @-suffixed name; no stack adjustment is emitted here
+%endif
+%endmacro
+
+%macro  do_exit  0-1 parms
+%ifdef DLL_EXPORT
+    ret %1                      ; DLL build: return and pop %1 parameter bytes (defaults to parms)
+%else
+    ret                         ; non-DLL build: plain return, parameters stay on the stack
+%endif
+%endmacro
+
+%ifdef  ENCRYPTION
+
+    extern  t_fn
+
+%define etab_0(x)   [t_fn+4*x]
+%define etab_1(x)   [t_fn+1024+4*x]
+%define etab_2(x)   [t_fn+2048+4*x]
+%define etab_3(x)   [t_fn+3072+4*x]
+
+%ifdef LAST_ROUND_TABLES
+
+    extern  t_fl
+
+%define eltab_0(x)  [t_fl+4*x]
+%define eltab_1(x)  [t_fl+1024+4*x]
+%define eltab_2(x)  [t_fl+2048+4*x]
+%define eltab_3(x)  [t_fl+3072+4*x]
+
+%else
+
+%define etab_b(x)   byte [t_fn+3072+4*x]
+
+%endif
+
+; ROUND FUNCTION.  Build column[2] on ESI and column[3] on EDI that have the
+; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
+;
+; Input:
+;
+;   EAX     column[0]
+;   EBX     column[1]
+;   ECX     column[2]
+;   EDX     column[3]
+;   ESI     column key[round][2]
+;   EDI     column key[round][3]
+;   EBP     scratch
+;
+; Output:
+;
+;   EBP     column[0]   unkeyed
+;   EBX     column[1]   unkeyed
+;   ESI     column[2]   keyed
+;   EDI     column[3]   keyed
+;   EAX     scratch
+;   ECX     scratch
+;   EDX     scratch
+
+%macro rnd_fun 2                 ; %1 = xor-style table macro, %2 = mov-style table macro (see enc_round)
+
+    rol     ebx,16               ; swap halves so bl/bh address bytes 2 and 3 of column[1]
+    %1      esi, cl, 0, ebp
+    %1      esi, dh, 1, ebp
+    %1      esi, bh, 3, ebp
+    %1      edi, dl, 0, ebp
+    %1      edi, ah, 1, ebp
+    %1      edi, bl, 2, ebp
+    %2      ebp, al, 0, ebp      ; first term of output column[0]; ebp is both index scratch and destination
+    shr     ebx,16               ; ebx = low 16 bits of the original column[1]
+    and     eax,0xffff0000       ; keep only the high half of column[0] ...
+    or      eax,ebx              ; ... and pack the low half of column[1] beneath it
+    shr     edx,16               ; dl/dh now address bytes 2 and 3 of column[3]
+    %1      ebp, ah, 1, ebx
+    %1      ebp, dh, 3, ebx
+    %2      ebx, dl, 2, ebx      ; first term of output column[1]; ebx is free because its data now lives in eax
+    %1      ebx, ch, 1, edx
+    %1      ebx, al, 0, edx
+    shr     eax,16               ; al/ah now address bytes 2 and 3 of column[0]
+    shr     ecx,16               ; cl/ch now address bytes 2 and 3 of column[2]
+    %1      ebp, cl, 2, edx
+    %1      edi, ch, 3, edx
+    %1      esi, al, 2, edx
+    %1      ebx, ah, 3, edx
+
+%endmacro
+
+; Basic MOV and XOR Operations for normal rounds
+
+%macro  nr_xor  4
+    movzx   %4,%2               ; %4 = state byte %2 zero-extended, used as the table index
+    xor     %1,etab_%3(%4)      ; %1 ^= dword entry %4 of forward table number %3 (t_fn + 1024*%3)
+%endmacro
+
+%macro  nr_mov  4
+    movzx   %4,%2               ; %4 = state byte %2 zero-extended, used as the table index
+    mov     %1,etab_%3(%4)      ; %1 = dword entry %4 of forward table number %3 (starts a new column)
+%endmacro
+
+; Basic MOV and XOR Operations for last round
+
+%ifdef LAST_ROUND_TABLES
+
+    %macro  lr_xor  4
+        movzx   %4,%2               ; index = state byte %2
+        xor     %1,eltab_%3(%4)     ; %1 ^= entry from last-round table number %3 (t_fl)
+    %endmacro
+
+    %macro  lr_mov  4
+        movzx   %4,%2               ; index = state byte %2
+        mov     %1,eltab_%3(%4)     ; %1 = entry from last-round table number %3 (t_fl)
+    %endmacro
+
+%else
+
+    %macro  lr_xor  4
+        movzx   %4,%2               ; index = state byte %2
+        movzx   %4,etab_b(%4)       ; fetch a single byte from the t_fn+3072 table entry
+    %if %3 != 0
+        shl     %4,8*%3             ; move that byte into byte lane %3
+    %endif
+        xor     %1,%4               ; accumulate into the destination column
+    %endmacro
+
+    %macro  lr_mov  4
+        movzx   %4,%2               ; index = state byte %2
+        movzx   %1,etab_b(%4)       ; destination = single byte from the t_fn+3072 table entry
+    %if %3 != 0
+        shl     %1,8*%3             ; move that byte into byte lane %3
+    %endif
+    %endmacro
+
+%endif
+
+%macro enc_round 0
+
+    add     ebp,16              ; advance the key pointer to the next round key (four dwords)
+    save    0,ebp               ; rnd_fun uses ebp, so park the key pointer in stack slot 0
+    mov     esi,[ebp+8]         ; preload key words 2 and 3; rnd_fun xors the new columns onto them
+    mov     edi,[ebp+12]
+
+    rnd_fun nr_xor, nr_mov
+
+    mov     eax,ebp             ; move the new columns back to eax/ecx/edx (column[1] is already in ebx)
+    mov     ecx,esi
+    mov     edx,edi
+    restore ebp,0               ; recover the key pointer
+    xor     eax,[ebp]           ; key columns 0 and 1, which rnd_fun left unkeyed
+    xor     ebx,[ebp+4]
+
+%endmacro
+
+%macro enc_last_round 0
+
+    add     ebp,16              ; advance the key pointer to the final round key
+    save    0,ebp               ; rnd_fun uses ebp, so park the key pointer in stack slot 0
+    mov     esi,[ebp+8]         ; preload key words 2 and 3
+    mov     edi,[ebp+12]
+
+    rnd_fun lr_xor, lr_mov
+
+    mov     eax,ebp             ; column[0] to eax; columns 2 and 3 stay in esi/edi for the caller
+    restore ebp,0               ; recover the key pointer
+    xor     eax,[ebp]           ; key columns 0 and 1
+    xor     ebx,[ebp+4]
+
+%endmacro
+
+    section .text align=32
+
+; AES Encryption Subroutine: encrypts the 16 bytes at in_blk into out_blk using ctx; eax = 0 on success, -1 on an unrecognised round count
+
+    do_name aes_encrypt
+
+    sub     esp,stk_spc              ; local frame: slot 0 plus four saved registers
+    mov     [esp+16],ebp             ; save ebp, ebx, esi and edi (restored at .5)
+    mov     [esp+12],ebx
+    mov     [esp+ 8],esi
+    mov     [esp+ 4],edi
+
+    mov     esi,[esp+in_blk+stk_spc] ; input pointer
+    mov     eax,[esi   ]             ; load the four input columns into eax, ebx, ecx, edx
+    mov     ebx,[esi+ 4]
+    mov     ecx,[esi+ 8]
+    mov     edx,[esi+12]
+
+    mov     ebp,[esp+ctx+stk_spc]    ; key pointer
+    movzx   edi,byte [ebp+4*KS_LENGTH] ; byte stored just past the KS_LENGTH key words; tested below against 16*rounds
+    xor     eax,[ebp   ]             ; xor in the first round key
+    xor     ebx,[ebp+ 4]
+    xor     ecx,[ebp+ 8]
+    xor     edx,[ebp+12]
+
+; determine the number of rounds
+
+    cmp     edi,10*16
+    je      .3                       ; 10 rounds
+    cmp     edi,12*16
+    je      .2                       ; 12 rounds
+    cmp     edi,14*16
+    je      .1                       ; 14 rounds
+    mov     eax,-1                   ; any other value: return -1 without writing out_blk
+    jmp     .5
+
+.1: enc_round                        ; 14-round entry: two extra rounds, then fall through
+    enc_round
+.2: enc_round                        ; 12-round entry: two extra rounds, then fall through
+    enc_round
+.3: enc_round                        ; 10-round entry: nine normal rounds followed by the last round
+    enc_round
+    enc_round
+    enc_round
+    enc_round
+    enc_round
+    enc_round
+    enc_round
+    enc_round
+    enc_last_round
+
+    mov     edx,[esp+out_blk+stk_spc] ; output pointer
+    mov     [edx],eax                ; the last round leaves columns 0..3 in eax, ebx, esi, edi
+    mov     [edx+4],ebx
+    mov     [edx+8],esi
+    mov     [edx+12],edi
+    xor     eax,eax                  ; return 0
+
+.5: mov     ebp,[esp+16]             ; common exit: restore the saved registers and release the frame
+    mov     ebx,[esp+12]
+    mov     esi,[esp+ 8]
+    mov     edi,[esp+ 4]
+    add     esp,stk_spc
+    do_exit
+
+%endif
+
+%ifdef  DECRYPTION
+
+    extern  t_in
+
+%define dtab_0(x)   [t_in+4*x]
+%define dtab_1(x)   [t_in+1024+4*x]
+%define dtab_2(x)   [t_in+2048+4*x]
+%define dtab_3(x)   [t_in+3072+4*x]
+
+%ifdef LAST_ROUND_TABLES
+
+    extern  t_il
+
+%define dltab_0(x)  [t_il+4*x]
+%define dltab_1(x)  [t_il+1024+4*x]
+%define dltab_2(x)  [t_il+2048+4*x]
+%define dltab_3(x)  [t_il+3072+4*x]
+
+%else
+
+    extern  _t_ibox
+
+%define dtab_x(x)   byte [_t_ibox+x]
+
+%endif
+
+%macro irn_fun 2                 ; %1 = xor-style table macro, %2 = mov-style table macro (see dec_round)
+
+    rol eax,16                   ; swap halves so al/ah address bytes 2 and 3 of column[0]
+    %1      esi, cl, 0, ebp
+    %1      esi, bh, 1, ebp
+    %1      esi, al, 2, ebp
+    %1      edi, dl, 0, ebp
+    %1      edi, ch, 1, ebp
+    %1      edi, ah, 3, ebp
+    %2      ebp, bl, 0, ebp      ; first term of output column[1], built in ebp
+    shr     eax,16               ; eax = low 16 bits of the original column[0]
+    and     ebx,0xffff0000       ; keep only the high half of column[1] ...
+    or      ebx,eax              ; ... and pack the low half of column[0] beneath it
+    shr     ecx,16               ; cl/ch now address bytes 2 and 3 of column[2]
+    %1      ebp, bh, 1, eax
+    %1      ebp, ch, 3, eax
+    %2      eax, cl, 2, ecx      ; first term of output column[0], built in eax
+    %1      eax, bl, 0, ecx
+    %1      eax, dh, 1, ecx
+    shr     ebx,16               ; bl/bh now address bytes 2 and 3 of column[1]
+    shr     edx,16               ; dl/dh now address bytes 2 and 3 of column[3]
+    %1      esi, dh, 3, ecx
+    %1      ebp, dl, 2, ecx
+    %1      eax, bh, 3, ecx
+    %1      edi, bl, 2, ecx
+
+%endmacro
+
+; Basic MOV and XOR Operations for normal rounds
+
+%macro  ni_xor  4
+    movzx   %4,%2               ; %4 = state byte %2 zero-extended, used as the table index
+    xor     %1,dtab_%3(%4)      ; %1 ^= dword entry %4 of inverse table number %3 (t_in + 1024*%3)
+%endmacro
+
+%macro  ni_mov  4
+    movzx   %4,%2               ; %4 = state byte %2 zero-extended, used as the table index
+    mov     %1,dtab_%3(%4)      ; %1 = dword entry %4 of inverse table number %3 (starts a new column)
+%endmacro
+
+; Basic MOV and XOR Operations for last round
+
+%ifdef LAST_ROUND_TABLES
+
+%macro  li_xor  4
+    movzx   %4,%2               ; index = state byte %2
+    xor     %1,dltab_%3(%4)     ; %1 ^= entry from inverse last-round table number %3 (t_il)
+%endmacro
+
+%macro  li_mov  4
+    movzx   %4,%2               ; index = state byte %2
+    mov     %1,dltab_%3(%4)     ; %1 = entry from inverse last-round table number %3 (t_il)
+%endmacro
+
+%else
+
+    %macro  li_xor  4
+        movzx   %4,%2               ; index = state byte %2
+        movzx   %4,dtab_x(%4)       ; fetch one byte from the _t_ibox byte table
+    %if %3 != 0
+        shl     %4,8*%3             ; move that byte into byte lane %3
+    %endif
+        xor     %1,%4               ; accumulate into the destination column
+    %endmacro
+
+    %macro  li_mov  4
+        movzx   %4,%2               ; index = state byte %2
+        movzx   %1,dtab_x(%4)       ; destination = one byte from the _t_ibox byte table
+    %if %3 != 0
+        shl     %1,8*%3             ; move that byte into byte lane %3
+    %endif
+    %endmacro
+
+%endif
+
+%macro dec_round 0
+
+%ifdef AES_REV_DKS
+    add     ebp,16              ; reversed schedule: the next round key is 16 bytes higher
+%else
+    sub     ebp,16              ; unreversed schedule: walk the round keys downwards
+%endif
+    save    0,ebp               ; irn_fun uses ebp, so park the key pointer in stack slot 0
+    mov     esi,[ebp+8]         ; preload key words 2 and 3; irn_fun xors the new columns onto them
+    mov     edi,[ebp+12]
+
+    irn_fun ni_xor, ni_mov
+
+    mov     ebx,ebp             ; irn_fun built column[1] in ebp (column[0] is already in eax)
+    mov     ecx,esi
+    mov     edx,edi
+    restore ebp,0               ; recover the key pointer
+    xor     eax,[ebp]           ; key columns 0 and 1
+    xor     ebx,[ebp+4]
+
+%endmacro
+
+%macro dec_last_round 0
+
+%ifdef AES_REV_DKS
+    add     ebp,16              ; reversed schedule: the final round key is 16 bytes higher
+%else
+    sub     ebp,16              ; unreversed schedule: the final round key is 16 bytes lower
+%endif
+    save    0,ebp               ; irn_fun uses ebp, so park the key pointer in stack slot 0
+    mov     esi,[ebp+8]         ; preload key words 2 and 3
+    mov     edi,[ebp+12]
+
+    irn_fun li_xor, li_mov
+
+    mov     ebx,ebp             ; column[1] to ebx; columns 2 and 3 stay in esi/edi for the caller
+    restore ebp,0               ; recover the key pointer
+    xor     eax,[ebp]           ; key columns 0 and 1
+    xor     ebx,[ebp+4]
+
+%endmacro
+
+    section .text
+
+; AES Decryption Subroutine: decrypts the 16 bytes at in_blk into out_blk using ctx; eax = 0 on success, -1 on an unrecognised round count
+
+    do_name aes_decrypt
+
+    sub     esp,stk_spc              ; local frame: slot 0 plus four saved registers
+    mov     [esp+16],ebp             ; save ebp, ebx, esi and edi (restored at .5)
+    mov     [esp+12],ebx
+    mov     [esp+ 8],esi
+    mov     [esp+ 4],edi
+
+; input four columns and xor in first round key
+
+    mov     esi,[esp+in_blk+stk_spc] ; input pointer
+    mov     eax,[esi   ]
+    mov     ebx,[esi+ 4]
+    mov     ecx,[esi+ 8]
+    mov     edx,[esi+12]
+    lea     esi,[esi+16]             ; NOTE(review): esi is reloaded by the first round before it is read again, so this looks redundant
+
+    mov     ebp,[esp+ctx+stk_spc]    ; key pointer
+    movzx   edi,byte[ebp+4*KS_LENGTH] ; byte stored just past the KS_LENGTH key words; tested below against 16*rounds
+%ifndef  AES_REV_DKS        ; if decryption key schedule is not reversed
+    lea     ebp,[ebp+edi]   ; we have to access it from the top down
+%endif
+    xor     eax,[ebp   ]    ; key schedule
+    xor     ebx,[ebp+ 4]
+    xor     ecx,[ebp+ 8]
+    xor     edx,[ebp+12]
+
+; determine the number of rounds
+
+    cmp     edi,10*16
+    je      .3                       ; 10 rounds
+    cmp     edi,12*16
+    je      .2                       ; 12 rounds
+    cmp     edi,14*16
+    je      .1                       ; 14 rounds
+    mov     eax,-1                   ; any other value: return -1 without writing out_blk
+    jmp     .5
+
+.1: dec_round                        ; 14-round entry: two extra rounds, then fall through
+    dec_round
+.2: dec_round                        ; 12-round entry: two extra rounds, then fall through
+    dec_round
+.3: dec_round                        ; 10-round entry: nine normal rounds followed by the last round
+    dec_round
+    dec_round
+    dec_round
+    dec_round
+    dec_round
+    dec_round
+    dec_round
+    dec_round
+    dec_last_round
+
+; move final values to the output array.
+
+    mov     ebp,[esp+out_blk+stk_spc] ; output pointer (the key pointer in ebp is no longer needed)
+    mov     [ebp],eax                ; the last round leaves columns 0..3 in eax, ebx, esi, edi
+    mov     [ebp+4],ebx
+    mov     [ebp+8],esi
+    mov     [ebp+12],edi
+    xor     eax,eax                  ; return 0
+
+.5: mov     ebp,[esp+16]             ; common exit: restore the saved registers and release the frame
+    mov     ebx,[esp+12]
+    mov     esi,[esp+ 8]
+    mov     edi,[esp+ 4]
+    add     esp,stk_spc
+    do_exit
+
+%endif
diff --git a/src/Crypto/Aescrypt.c b/src/Crypto/Aescrypt.c
index c77ec675..46175981 100644
--- a/src/Crypto/Aescrypt.c
+++ b/src/Crypto/Aescrypt.c
@@ -1,311 +1,311 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 20/12/2007
-*/
-
-#include "Aesopt.h"
-#include "Aestab.h"
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-#define si(y,x,k,c) (s(y,c) = word_in(x, c) ^ (k)[c])
-#define so(y,x,c)   word_out(y, c, s(x,c))
-
-#if defined(ARRAYS)
-#define locals(y,x)     x[4],y[4]
-#else
-#define locals(y,x)     x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3
-#endif
-
-#define l_copy(y, x)    s(y,0) = s(x,0); s(y,1) = s(x,1); \
-                        s(y,2) = s(x,2); s(y,3) = s(x,3);
-#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); si(y,x,k,3)
-#define state_out(y,x)  so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
-#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3)
-
-#if ( FUNCS_IN_C & ENCRYPTION_IN_C )
-
-/* Visual C++ .Net v7.1 provides the fastest encryption code when using
-   Pentium optimiation with small code but this is poor for decryption
-   so we need to control this with the following VC++ pragmas
-*/
-
-#if defined( _MSC_VER ) && !defined( _WIN64 )
-#pragma optimize( "s", on )
-#endif
-
-/* Given the column (c) of the output state variable, the following
-   macros give the input state variables which are needed in its
-   computation for each row (r) of the state. All the alternative
-   macros give the same end values but expand into different ways
-   of calculating these values.  In particular the complex macro
-   used for dynamically variable block sizes is designed to expand
-   to a compile time constant whenever possible but will expand to
-   conditional clauses on some branches (I am grateful to Frank
-   Yellin for this construction)
-*/
-
-#define fwd_var(x,r,c)\
- ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
- : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\
- : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
- :          ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2)))
-
-#if defined(FT4_SET)
-#undef  dec_fmvars
-#define fwd_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c))
-#elif defined(FT1_SET)
-#undef  dec_fmvars
-#define fwd_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ one_table(x,upr,t_use(f,n),fwd_var,rf1,c))
-#else
-#define fwd_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ fwd_mcol(no_table(x,t_use(s,box),fwd_var,rf1,c)))
-#endif
-
-#if defined(FL4_SET)
-#define fwd_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,l),fwd_var,rf1,c))
-#elif defined(FL1_SET)
-#define fwd_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ one_table(x,ups,t_use(f,l),fwd_var,rf1,c))
-#else
-#define fwd_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ no_table(x,t_use(s,box),fwd_var,rf1,c))
-#endif
-
-AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1])
-{   uint_32t         locals(b0, b1);
-    const uint_32t   *kp;
-#if defined( dec_fmvars )
-    dec_fmvars; /* declare variables for fwd_mcol() if needed */
-#endif
-
-#if defined( AES_ERR_CHK )
-    if( cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16 )
-        return EXIT_FAILURE;
-#endif
-
-    kp = cx->ks;
-    state_in(b0, in, kp);
-
-#if (ENC_UNROLL == FULL)
-
-    switch(cx->inf.b[0])
-    {
-    case 14 * 16:
-        round(fwd_rnd,  b1, b0, kp + 1 * N_COLS);
-        round(fwd_rnd,  b0, b1, kp + 2 * N_COLS);
-        kp += 2 * N_COLS;
-    case 12 * 16:
-        round(fwd_rnd,  b1, b0, kp + 1 * N_COLS);
-        round(fwd_rnd,  b0, b1, kp + 2 * N_COLS);
-        kp += 2 * N_COLS;
-    case 10 * 16:
-        round(fwd_rnd,  b1, b0, kp + 1 * N_COLS);
-        round(fwd_rnd,  b0, b1, kp + 2 * N_COLS);
-        round(fwd_rnd,  b1, b0, kp + 3 * N_COLS);
-        round(fwd_rnd,  b0, b1, kp + 4 * N_COLS);
-        round(fwd_rnd,  b1, b0, kp + 5 * N_COLS);
-        round(fwd_rnd,  b0, b1, kp + 6 * N_COLS);
-        round(fwd_rnd,  b1, b0, kp + 7 * N_COLS);
-        round(fwd_rnd,  b0, b1, kp + 8 * N_COLS);
-        round(fwd_rnd,  b1, b0, kp + 9 * N_COLS);
-        round(fwd_lrnd, b0, b1, kp +10 * N_COLS);
-    }
-
-#else
-
-#if (ENC_UNROLL == PARTIAL)
-    {   uint_32t    rnd;
-        for(rnd = 0; rnd < (cx->inf.b[0] >> 5) - 1; ++rnd)
-        {
-            kp += N_COLS;
-            round(fwd_rnd, b1, b0, kp);
-            kp += N_COLS;
-            round(fwd_rnd, b0, b1, kp);
-        }
-        kp += N_COLS;
-        round(fwd_rnd,  b1, b0, kp);
-#else
-    {   uint_32t    rnd;
-        for(rnd = 0; rnd < (cx->inf.b[0] >> 4) - 1; ++rnd)
-        {
-            kp += N_COLS;
-            round(fwd_rnd, b1, b0, kp);
-            l_copy(b0, b1);
-        }
-#endif
-        kp += N_COLS;
-        round(fwd_lrnd, b0, b1, kp);
-    }
-#endif
-
-    state_out(out, b0);
-
-#if defined( AES_ERR_CHK )
-    return EXIT_SUCCESS;
-#endif
-}
-
-#endif
-
-#if ( FUNCS_IN_C & DECRYPTION_IN_C)
-
-/* Visual C++ .Net v7.1 provides the fastest encryption code when using
-   Pentium optimiation with small code but this is poor for decryption
-   so we need to control this with the following VC++ pragmas
-*/
-
-#if defined( _MSC_VER ) && !defined( _WIN64 )
-#pragma optimize( "t", on )
-#endif
-
-/* Given the column (c) of the output state variable, the following
-   macros give the input state variables which are needed in its
-   computation for each row (r) of the state. All the alternative
-   macros give the same end values but expand into different ways
-   of calculating these values.  In particular the complex macro
-   used for dynamically variable block sizes is designed to expand
-   to a compile time constant whenever possible but will expand to
-   conditional clauses on some branches (I am grateful to Frank
-   Yellin for this construction)
-*/
-
-#define inv_var(x,r,c)\
- ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
- : r == 1 ? ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))\
- : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
- :          ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0)))
-
-#if defined(IT4_SET)
-#undef  dec_imvars
-#define inv_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ four_tables(x,t_use(i,n),inv_var,rf1,c))
-#elif defined(IT1_SET)
-#undef  dec_imvars
-#define inv_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ one_table(x,upr,t_use(i,n),inv_var,rf1,c))
-#else
-#define inv_rnd(y,x,k,c)    (s(y,c) = inv_mcol((k)[c] ^ no_table(x,t_use(i,box),inv_var,rf1,c)))
-#endif
-
-#if defined(IL4_SET)
-#define inv_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ four_tables(x,t_use(i,l),inv_var,rf1,c))
-#elif defined(IL1_SET)
-#define inv_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ one_table(x,ups,t_use(i,l),inv_var,rf1,c))
-#else
-#define inv_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ no_table(x,t_use(i,box),inv_var,rf1,c))
-#endif
-
-/* This code can work with the decryption key schedule in the   */
-/* order that is used for encrytpion (where the 1st decryption  */
-/* round key is at the high end ot the schedule) or with a key  */
-/* schedule that has been reversed to put the 1st decryption    */
-/* round key at the low end of the schedule in memory (when     */
-/* AES_REV_DKS is defined)                                      */
-
-#ifdef AES_REV_DKS
-#define key_ofs     0
-#define rnd_key(n)  (kp + n * N_COLS)
-#else
-#define key_ofs     1
-#define rnd_key(n)  (kp - n * N_COLS)
-#endif
-
-AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1])
-{   uint_32t        locals(b0, b1);
-#if defined( dec_imvars )
-    dec_imvars; /* declare variables for inv_mcol() if needed */
-#endif
-    const uint_32t *kp;
-
-#if defined( AES_ERR_CHK )
-    if( cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16 )
-        return EXIT_FAILURE;
-#endif
-
-    kp = cx->ks + (key_ofs ? (cx->inf.b[0] >> 2) : 0);
-    state_in(b0, in, kp);
-
-#if (DEC_UNROLL == FULL)
-
-    kp = cx->ks + (key_ofs ? 0 : (cx->inf.b[0] >> 2));
-    switch(cx->inf.b[0])
-    {
-    case 14 * 16:
-        round(inv_rnd,  b1, b0, rnd_key(-13));
-        round(inv_rnd,  b0, b1, rnd_key(-12));
-    case 12 * 16:
-        round(inv_rnd,  b1, b0, rnd_key(-11));
-        round(inv_rnd,  b0, b1, rnd_key(-10));
-    case 10 * 16:
-        round(inv_rnd,  b1, b0, rnd_key(-9));
-        round(inv_rnd,  b0, b1, rnd_key(-8));
-        round(inv_rnd,  b1, b0, rnd_key(-7));
-        round(inv_rnd,  b0, b1, rnd_key(-6));
-        round(inv_rnd,  b1, b0, rnd_key(-5));
-        round(inv_rnd,  b0, b1, rnd_key(-4));
-        round(inv_rnd,  b1, b0, rnd_key(-3));
-        round(inv_rnd,  b0, b1, rnd_key(-2));
-        round(inv_rnd,  b1, b0, rnd_key(-1));
-        round(inv_lrnd, b0, b1, rnd_key( 0));
-    }
-
-#else
-
-#if (DEC_UNROLL == PARTIAL)
-    {   uint_32t    rnd;
-        for(rnd = 0; rnd < (cx->inf.b[0] >> 5) - 1; ++rnd)
-        {
-            kp = rnd_key(1);
-            round(inv_rnd, b1, b0, kp);
-            kp = rnd_key(1);
-            round(inv_rnd, b0, b1, kp);
-        }
-        kp = rnd_key(1);
-        round(inv_rnd, b1, b0, kp);
-#else
-    {   uint_32t    rnd;
-        for(rnd = 0; rnd < (cx->inf.b[0] >> 4) - 1; ++rnd)
-        {
-            kp = rnd_key(1);
-            round(inv_rnd, b1, b0, kp);
-            l_copy(b0, b1);
-        }
-#endif
-        kp = rnd_key(1);
-        round(inv_lrnd, b0, b1, kp);
-        }
-#endif
-
-    state_out(out, b0);
-
-#if defined( AES_ERR_CHK )
-    return EXIT_SUCCESS;
-#endif
-}
-
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+*/
+
+#include "Aesopt.h"
+#include "Aestab.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#define si(y,x,k,c) (s(y,c) = word_in(x, c) ^ (k)[c])
+#define so(y,x,c)   word_out(y, c, s(x,c))
+
+#if defined(ARRAYS)
+#define locals(y,x)     x[4],y[4]
+#else
+#define locals(y,x)     x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3
+#endif
+
+#define l_copy(y, x)    s(y,0) = s(x,0); s(y,1) = s(x,1); \
+                        s(y,2) = s(x,2); s(y,3) = s(x,3);
+#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); si(y,x,k,3)
+#define state_out(y,x)  so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
+#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3)
+
+#if ( FUNCS_IN_C & ENCRYPTION_IN_C )
+
+/* Visual C++ .Net v7.1 provides the fastest encryption code when using
+   Pentium optimisation with small code but this is poor for decryption
+   so we need to control this with the following VC++ pragmas
+*/
+
+#if defined( _MSC_VER ) && !defined( _WIN64 )
+#pragma optimize( "s", on )
+#endif
+
+/* Given the column (c) of the output state variable, the following
+   macros give the input state variables which are needed in its
+   computation for each row (r) of the state. All the alternative
+   macros give the same end values but expand into different ways
+   of calculating these values.  In particular the complex macro
+   used for dynamically variable block sizes is designed to expand
+   to a compile time constant whenever possible but will expand to
+   conditional clauses on some branches (I am grateful to Frank
+   Yellin for this construction)
+*/
+
+#define fwd_var(x,r,c)\
+ ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
+ : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\
+ : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
+ :          ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2)))
+
+#if defined(FT4_SET)
+#undef  dec_fmvars
+#define fwd_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c))
+#elif defined(FT1_SET)
+#undef  dec_fmvars
+#define fwd_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ one_table(x,upr,t_use(f,n),fwd_var,rf1,c))
+#else
+#define fwd_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ fwd_mcol(no_table(x,t_use(s,box),fwd_var,rf1,c)))
+#endif
+
+#if defined(FL4_SET)
+#define fwd_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,l),fwd_var,rf1,c))
+#elif defined(FL1_SET)
+#define fwd_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ one_table(x,ups,t_use(f,l),fwd_var,rf1,c))
+#else
+#define fwd_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ no_table(x,t_use(s,box),fwd_var,rf1,c))
+#endif
+
+AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1])
+{   uint_32t         locals(b0, b1);
+    const uint_32t   *kp;
+#if defined( dec_fmvars )
+    dec_fmvars; /* declare variables for fwd_mcol() if needed */
+#endif
+
+#if defined( AES_ERR_CHK )
+    if( cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16 )
+        return EXIT_FAILURE;
+#endif
+
+    kp = cx->ks;
+    state_in(b0, in, kp);
+
+#if (ENC_UNROLL == FULL)
+
+    switch(cx->inf.b[0])
+    {
+    case 14 * 16:
+        round(fwd_rnd,  b1, b0, kp + 1 * N_COLS);
+        round(fwd_rnd,  b0, b1, kp + 2 * N_COLS);
+        kp += 2 * N_COLS;
+    case 12 * 16:
+        round(fwd_rnd,  b1, b0, kp + 1 * N_COLS);
+        round(fwd_rnd,  b0, b1, kp + 2 * N_COLS);
+        kp += 2 * N_COLS;
+    case 10 * 16:
+        round(fwd_rnd,  b1, b0, kp + 1 * N_COLS);
+        round(fwd_rnd,  b0, b1, kp + 2 * N_COLS);
+        round(fwd_rnd,  b1, b0, kp + 3 * N_COLS);
+        round(fwd_rnd,  b0, b1, kp + 4 * N_COLS);
+        round(fwd_rnd,  b1, b0, kp + 5 * N_COLS);
+        round(fwd_rnd,  b0, b1, kp + 6 * N_COLS);
+        round(fwd_rnd,  b1, b0, kp + 7 * N_COLS);
+        round(fwd_rnd,  b0, b1, kp + 8 * N_COLS);
+        round(fwd_rnd,  b1, b0, kp + 9 * N_COLS);
+        round(fwd_lrnd, b0, b1, kp +10 * N_COLS);
+    }
+
+#else
+
+#if (ENC_UNROLL == PARTIAL)
+    {   uint_32t    rnd;
+        for(rnd = 0; rnd < (cx->inf.b[0] >> 5) - 1; ++rnd)
+        {
+            kp += N_COLS;
+            round(fwd_rnd, b1, b0, kp);
+            kp += N_COLS;
+            round(fwd_rnd, b0, b1, kp);
+        }
+        kp += N_COLS;
+        round(fwd_rnd,  b1, b0, kp);
+#else
+    {   uint_32t    rnd;
+        for(rnd = 0; rnd < (cx->inf.b[0] >> 4) - 1; ++rnd)
+        {
+            kp += N_COLS;
+            round(fwd_rnd, b1, b0, kp);
+            l_copy(b0, b1);
+        }
+#endif
+        kp += N_COLS;
+        round(fwd_lrnd, b0, b1, kp);
+    }
+#endif
+
+    state_out(out, b0);
+
+#if defined( AES_ERR_CHK )
+    return EXIT_SUCCESS;
+#endif
+}
+
+#endif
+
+#if ( FUNCS_IN_C & DECRYPTION_IN_C)
+
+/* Visual C++ .Net v7.1 provides the fastest encryption code when using
+   Pentium optimisation with small code but this is poor for decryption
+   so we need to control this with the following VC++ pragmas
+*/
+
+#if defined( _MSC_VER ) && !defined( _WIN64 )
+#pragma optimize( "t", on )
+#endif
+
+/* Given the column (c) of the output state variable, the following
+   macros give the input state variables which are needed in its
+   computation for each row (r) of the state. All the alternative
+   macros give the same end values but expand into different ways
+   of calculating these values.  In particular the complex macro
+   used for dynamically variable block sizes is designed to expand
+   to a compile time constant whenever possible but will expand to
+   conditional clauses on some branches (I am grateful to Frank
+   Yellin for this construction)
+*/
+
+#define inv_var(x,r,c)\
+ ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
+ : r == 1 ? ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))\
+ : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
+ :          ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0)))
+
+#if defined(IT4_SET)
+#undef  dec_imvars
+#define inv_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ four_tables(x,t_use(i,n),inv_var,rf1,c))
+#elif defined(IT1_SET)
+#undef  dec_imvars
+#define inv_rnd(y,x,k,c)    (s(y,c) = (k)[c] ^ one_table(x,upr,t_use(i,n),inv_var,rf1,c))
+#else
+#define inv_rnd(y,x,k,c)    (s(y,c) = inv_mcol((k)[c] ^ no_table(x,t_use(i,box),inv_var,rf1,c)))
+#endif
+
+#if defined(IL4_SET)
+#define inv_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ four_tables(x,t_use(i,l),inv_var,rf1,c))
+#elif defined(IL1_SET)
+#define inv_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ one_table(x,ups,t_use(i,l),inv_var,rf1,c))
+#else
+#define inv_lrnd(y,x,k,c)   (s(y,c) = (k)[c] ^ no_table(x,t_use(i,box),inv_var,rf1,c))
+#endif
+
+/* This code can work with the decryption key schedule in the   */
+/* order that is used for encryption (where the 1st decryption  */
+/* round key is at the high end of the schedule) or with a key  */
+/* schedule that has been reversed to put the 1st decryption    */
+/* round key at the low end of the schedule in memory (when     */
+/* AES_REV_DKS is defined)                                      */
+
+#ifdef AES_REV_DKS
+#define key_ofs     0
+#define rnd_key(n)  (kp + n * N_COLS)
+#else
+#define key_ofs     1
+#define rnd_key(n)  (kp - n * N_COLS)
+#endif
+
+AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1])
+{   uint_32t        locals(b0, b1);
+#if defined( dec_imvars )
+    dec_imvars; /* declare variables for inv_mcol() if needed */
+#endif
+    const uint_32t *kp;
+
+#if defined( AES_ERR_CHK )
+    if( cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16 )
+        return EXIT_FAILURE;
+#endif
+
+    kp = cx->ks + (key_ofs ? (cx->inf.b[0] >> 2) : 0);
+    state_in(b0, in, kp);
+
+#if (DEC_UNROLL == FULL)
+
+    kp = cx->ks + (key_ofs ? 0 : (cx->inf.b[0] >> 2));
+    switch(cx->inf.b[0])
+    {
+    case 14 * 16:
+        round(inv_rnd,  b1, b0, rnd_key(-13));
+        round(inv_rnd,  b0, b1, rnd_key(-12));
+    case 12 * 16:
+        round(inv_rnd,  b1, b0, rnd_key(-11));
+        round(inv_rnd,  b0, b1, rnd_key(-10));
+    case 10 * 16:
+        round(inv_rnd,  b1, b0, rnd_key(-9));
+        round(inv_rnd,  b0, b1, rnd_key(-8));
+        round(inv_rnd,  b1, b0, rnd_key(-7));
+        round(inv_rnd,  b0, b1, rnd_key(-6));
+        round(inv_rnd,  b1, b0, rnd_key(-5));
+        round(inv_rnd,  b0, b1, rnd_key(-4));
+        round(inv_rnd,  b1, b0, rnd_key(-3));
+        round(inv_rnd,  b0, b1, rnd_key(-2));
+        round(inv_rnd,  b1, b0, rnd_key(-1));
+        round(inv_lrnd, b0, b1, rnd_key( 0));
+    }
+
+#else
+
+#if (DEC_UNROLL == PARTIAL)
+    {   uint_32t    rnd;
+        for(rnd = 0; rnd < (cx->inf.b[0] >> 5) - 1; ++rnd)
+        {
+            kp = rnd_key(1);
+            round(inv_rnd, b1, b0, kp);
+            kp = rnd_key(1);
+            round(inv_rnd, b0, b1, kp);
+        }
+        kp = rnd_key(1);
+        round(inv_rnd, b1, b0, kp);
+#else
+    {   uint_32t    rnd;
+        for(rnd = 0; rnd < (cx->inf.b[0] >> 4) - 1; ++rnd)
+        {
+            kp = rnd_key(1);
+            round(inv_rnd, b1, b0, kp);
+            l_copy(b0, b1);
+        }
+#endif
+        kp = rnd_key(1);
+        round(inv_lrnd, b0, b1, kp);
+        }
+#endif
+
+    state_out(out, b0);
+
+#if defined( AES_ERR_CHK )
+    return EXIT_SUCCESS;
+#endif
+}
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/src/Crypto/Aeskey.c b/src/Crypto/Aeskey.c
index 948b9238..c9ab0269 100644
--- a/src/Crypto/Aeskey.c
+++ b/src/Crypto/Aeskey.c
@@ -1,573 +1,573 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 20/12/2007
-*/
-
-#include "Aesopt.h"
-#include "Aestab.h"
-
-#ifdef USE_VIA_ACE_IF_PRESENT
-#  include "aes_via_ace.h"
-#endif
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-/* Initialise the key schedule from the user supplied key. The key
-   length can be specified in bytes, with legal values of 16, 24
-   and 32, or in bits, with legal values of 128, 192 and 256. These
-   values correspond with Nk values of 4, 6 and 8 respectively.
-
-   The following macros implement a single cycle in the key
-   schedule generation process. The number of cycles needed
-   for each cx->n_col and nk value is:
-
-    nk =             4  5  6  7  8
-    ------------------------------
-    cx->n_col = 4   10  9  8  7  7
-    cx->n_col = 5   14 11 10  9  9
-    cx->n_col = 6   19 15 12 11 11
-    cx->n_col = 7   21 19 16 13 14
-    cx->n_col = 8   29 23 19 17 14
-*/
-
-#if (FUNCS_IN_C & ENC_KEYING_IN_C)
-
-#if defined(AES_128) || defined(AES_VAR)
-
-#define ke4(k,i) \
-{   k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; \
-    k[4*(i)+5] = ss[1] ^= ss[0]; \
-    k[4*(i)+6] = ss[2] ^= ss[1]; \
-    k[4*(i)+7] = ss[3] ^= ss[2]; \
-}
-
-AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1])
-{   uint_32t    ss[4];
-
-    cx->ks[0] = ss[0] = word_in(key, 0);
-    cx->ks[1] = ss[1] = word_in(key, 1);
-    cx->ks[2] = ss[2] = word_in(key, 2);
-    cx->ks[3] = ss[3] = word_in(key, 3);
-
-#if ENC_UNROLL == NONE
-    {   uint_32t i;
-        for(i = 0; i < 9; ++i)
-            ke4(cx->ks, i);
-    }
-#else
-    ke4(cx->ks, 0);  ke4(cx->ks, 1);
-    ke4(cx->ks, 2);  ke4(cx->ks, 3);
-    ke4(cx->ks, 4);  ke4(cx->ks, 5);
-    ke4(cx->ks, 6);  ke4(cx->ks, 7);
-    ke4(cx->ks, 8);
-#endif
-    ke4(cx->ks, 9);
-    cx->inf.l = 0;
-    cx->inf.b[0] = 10 * 16;
-
-#ifdef USE_VIA_ACE_IF_PRESENT
-    if(VIA_ACE_AVAILABLE)
-        cx->inf.b[1] = 0xff;
-#endif
-
-#if defined( AES_ERR_CHK )
-    return EXIT_SUCCESS;
-#endif
-}
-
-#endif
-
-#if defined(AES_192) || defined(AES_VAR)
-
-#define kef6(k,i) \
-{   k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; \
-    k[6*(i)+ 7] = ss[1] ^= ss[0]; \
-    k[6*(i)+ 8] = ss[2] ^= ss[1]; \
-    k[6*(i)+ 9] = ss[3] ^= ss[2]; \
-}
-
-#define ke6(k,i) \
-{   kef6(k,i); \
-    k[6*(i)+10] = ss[4] ^= ss[3]; \
-    k[6*(i)+11] = ss[5] ^= ss[4]; \
-}
-
-AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1])
-{   uint_32t    ss[6];
-
-    cx->ks[0] = ss[0] = word_in(key, 0);
-    cx->ks[1] = ss[1] = word_in(key, 1);
-    cx->ks[2] = ss[2] = word_in(key, 2);
-    cx->ks[3] = ss[3] = word_in(key, 3);
-    cx->ks[4] = ss[4] = word_in(key, 4);
-    cx->ks[5] = ss[5] = word_in(key, 5);
-
-#if ENC_UNROLL == NONE
-    {   uint_32t i;
-        for(i = 0; i < 7; ++i)
-            ke6(cx->ks, i);
-    }
-#else
-    ke6(cx->ks, 0);  ke6(cx->ks, 1);
-    ke6(cx->ks, 2);  ke6(cx->ks, 3);
-    ke6(cx->ks, 4);  ke6(cx->ks, 5);
-    ke6(cx->ks, 6);
-#endif
-    kef6(cx->ks, 7);
-    cx->inf.l = 0;
-    cx->inf.b[0] = 12 * 16;
-
-#ifdef USE_VIA_ACE_IF_PRESENT
-    if(VIA_ACE_AVAILABLE)
-        cx->inf.b[1] = 0xff;
-#endif
-
-#if defined( AES_ERR_CHK )
-    return EXIT_SUCCESS;
-#endif
-}
-
-#endif
-
-#if defined(AES_256) || defined(AES_VAR)
-
-#define kef8(k,i) \
-{   k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; \
-    k[8*(i)+ 9] = ss[1] ^= ss[0]; \
-    k[8*(i)+10] = ss[2] ^= ss[1]; \
-    k[8*(i)+11] = ss[3] ^= ss[2]; \
-}
-
-#define ke8(k,i) \
-{   kef8(k,i); \
-    k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \
-    k[8*(i)+13] = ss[5] ^= ss[4]; \
-    k[8*(i)+14] = ss[6] ^= ss[5]; \
-    k[8*(i)+15] = ss[7] ^= ss[6]; \
-}
-
-AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1])
-{   uint_32t    ss[8];
-
-    cx->ks[0] = ss[0] = word_in(key, 0);
-    cx->ks[1] = ss[1] = word_in(key, 1);
-    cx->ks[2] = ss[2] = word_in(key, 2);
-    cx->ks[3] = ss[3] = word_in(key, 3);
-    cx->ks[4] = ss[4] = word_in(key, 4);
-    cx->ks[5] = ss[5] = word_in(key, 5);
-    cx->ks[6] = ss[6] = word_in(key, 6);
-    cx->ks[7] = ss[7] = word_in(key, 7);
-
-#if ENC_UNROLL == NONE
-    {   uint_32t i;
-        for(i = 0; i < 6; ++i)
-            ke8(cx->ks,  i);
-    }
-#else
-    ke8(cx->ks, 0); ke8(cx->ks, 1);
-    ke8(cx->ks, 2); ke8(cx->ks, 3);
-    ke8(cx->ks, 4); ke8(cx->ks, 5);
-#endif
-    kef8(cx->ks, 6);
-    cx->inf.l = 0;
-    cx->inf.b[0] = 14 * 16;
-
-#ifdef USE_VIA_ACE_IF_PRESENT
-    if(VIA_ACE_AVAILABLE)
-        cx->inf.b[1] = 0xff;
-#endif
-
-#if defined( AES_ERR_CHK )
-    return EXIT_SUCCESS;
-#endif
-}
-
-#endif
-
-#if defined(AES_VAR)
-
-AES_RETURN aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1])
-{
-    switch(key_len)
-    {
-#if defined( AES_ERR_CHK )
-    case 16: case 128: return aes_encrypt_key128(key, cx);
-    case 24: case 192: return aes_encrypt_key192(key, cx);
-    case 32: case 256: return aes_encrypt_key256(key, cx);
-    default: return EXIT_FAILURE;
-#else
-    case 16: case 128: aes_encrypt_key128(key, cx); return;
-    case 24: case 192: aes_encrypt_key192(key, cx); return;
-    case 32: case 256: aes_encrypt_key256(key, cx); return;
-#endif
-    }
-}
-
-#endif
-
-#endif
-
-#if (FUNCS_IN_C & DEC_KEYING_IN_C)
-
-/* this is used to store the decryption round keys  */
-/* in forward or reverse order                      */
-
-#ifdef AES_REV_DKS
-#define v(n,i)  ((n) - (i) + 2 * ((i) & 3))
-#else
-#define v(n,i)  (i)
-#endif
-
-#if DEC_ROUND == NO_TABLES
-#define ff(x)   (x)
-#else
-#define ff(x)   inv_mcol(x)
-#if defined( dec_imvars )
-#define d_vars  dec_imvars
-#endif
-#endif
-
-#if defined(AES_128) || defined(AES_VAR)
-
-#define k4e(k,i) \
-{   k[v(40,(4*(i))+4)] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; \
-    k[v(40,(4*(i))+5)] = ss[1] ^= ss[0]; \
-    k[v(40,(4*(i))+6)] = ss[2] ^= ss[1]; \
-    k[v(40,(4*(i))+7)] = ss[3] ^= ss[2]; \
-}
-
-#if 1
-
-#define kdf4(k,i) \
-{   ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \
-    ss[1] = ss[1] ^ ss[3]; \
-    ss[2] = ss[2] ^ ss[3]; \
-    ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; \
-    ss[i % 4] ^= ss[4]; \
-    ss[4] ^= k[v(40,(4*(i)))];   k[v(40,(4*(i))+4)] = ff(ss[4]); \
-    ss[4] ^= k[v(40,(4*(i))+1)]; k[v(40,(4*(i))+5)] = ff(ss[4]); \
-    ss[4] ^= k[v(40,(4*(i))+2)]; k[v(40,(4*(i))+6)] = ff(ss[4]); \
-    ss[4] ^= k[v(40,(4*(i))+3)]; k[v(40,(4*(i))+7)] = ff(ss[4]); \
-}
-
-#define kd4(k,i) \
-{   ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; \
-    ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \
-    k[v(40,(4*(i))+4)] = ss[4] ^= k[v(40,(4*(i)))]; \
-    k[v(40,(4*(i))+5)] = ss[4] ^= k[v(40,(4*(i))+1)]; \
-    k[v(40,(4*(i))+6)] = ss[4] ^= k[v(40,(4*(i))+2)]; \
-    k[v(40,(4*(i))+7)] = ss[4] ^= k[v(40,(4*(i))+3)]; \
-}
-
-#define kdl4(k,i) \
-{   ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \
-    k[v(40,(4*(i))+4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
-    k[v(40,(4*(i))+5)] = ss[1] ^ ss[3]; \
-    k[v(40,(4*(i))+6)] = ss[0]; \
-    k[v(40,(4*(i))+7)] = ss[1]; \
-}
-
-#else
-
-#define kdf4(k,i) \
-{   ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[v(40,(4*(i))+ 4)] = ff(ss[0]); \
-    ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ff(ss[1]); \
-    ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ff(ss[2]); \
-    ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ff(ss[3]); \
-}
-
-#define kd4(k,i) \
-{   ss[4] = ls_box(ss[3],3) ^ t_use(r,c)[i]; \
-    ss[0] ^= ss[4]; ss[4] = ff(ss[4]); k[v(40,(4*(i))+ 4)] = ss[4] ^= k[v(40,(4*(i)))]; \
-    ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ss[4] ^= k[v(40,(4*(i))+ 1)]; \
-    ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ss[4] ^= k[v(40,(4*(i))+ 2)]; \
-    ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ss[4] ^= k[v(40,(4*(i))+ 3)]; \
-}
-
-#define kdl4(k,i) \
-{   ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[v(40,(4*(i))+ 4)] = ss[0]; \
-    ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ss[1]; \
-    ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ss[2]; \
-    ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ss[3]; \
-}
-
-#endif
-
-AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1])
-{   uint_32t    ss[5];
-#if defined( d_vars )
-        d_vars;
-#endif
-    cx->ks[v(40,(0))] = ss[0] = word_in(key, 0);
-    cx->ks[v(40,(1))] = ss[1] = word_in(key, 1);
-    cx->ks[v(40,(2))] = ss[2] = word_in(key, 2);
-    cx->ks[v(40,(3))] = ss[3] = word_in(key, 3);
-
-#if DEC_UNROLL == NONE
-    {   uint_32t i;
-        for(i = 0; i < 10; ++i)
-            k4e(cx->ks, i);
-#if !(DEC_ROUND == NO_TABLES)
-        for(i = N_COLS; i < 10 * N_COLS; ++i)
-            cx->ks[i] = inv_mcol(cx->ks[i]);
-#endif
-    }
-#else
-    kdf4(cx->ks, 0);  kd4(cx->ks, 1);
-     kd4(cx->ks, 2);  kd4(cx->ks, 3);
-     kd4(cx->ks, 4);  kd4(cx->ks, 5);
-     kd4(cx->ks, 6);  kd4(cx->ks, 7);
-     kd4(cx->ks, 8); kdl4(cx->ks, 9);
-#endif
-    cx->inf.l = 0;
-    cx->inf.b[0] = 10 * 16;
-
-#ifdef USE_VIA_ACE_IF_PRESENT
-    if(VIA_ACE_AVAILABLE)
-        cx->inf.b[1] = 0xff;
-#endif
-
-#if defined( AES_ERR_CHK )
-    return EXIT_SUCCESS;
-#endif
-}
-
-#endif
-
-#if defined(AES_192) || defined(AES_VAR)
-
-#define k6ef(k,i) \
-{   k[v(48,(6*(i))+ 6)] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; \
-    k[v(48,(6*(i))+ 7)] = ss[1] ^= ss[0]; \
-    k[v(48,(6*(i))+ 8)] = ss[2] ^= ss[1]; \
-    k[v(48,(6*(i))+ 9)] = ss[3] ^= ss[2]; \
-}
-
-#define k6e(k,i) \
-{   k6ef(k,i); \
-    k[v(48,(6*(i))+10)] = ss[4] ^= ss[3]; \
-    k[v(48,(6*(i))+11)] = ss[5] ^= ss[4]; \
-}
-
-#define kdf6(k,i) \
-{   ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[v(48,(6*(i))+ 6)] = ff(ss[0]); \
-    ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ff(ss[1]); \
-    ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ff(ss[2]); \
-    ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ff(ss[3]); \
-    ss[4] ^= ss[3]; k[v(48,(6*(i))+10)] = ff(ss[4]); \
-    ss[5] ^= ss[4]; k[v(48,(6*(i))+11)] = ff(ss[5]); \
-}
-
-#define kd6(k,i) \
-{   ss[6] = ls_box(ss[5],3) ^ t_use(r,c)[i]; \
-    ss[0] ^= ss[6]; ss[6] = ff(ss[6]); k[v(48,(6*(i))+ 6)] = ss[6] ^= k[v(48,(6*(i)))]; \
-    ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ss[6] ^= k[v(48,(6*(i))+ 1)]; \
-    ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ss[6] ^= k[v(48,(6*(i))+ 2)]; \
-    ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ss[6] ^= k[v(48,(6*(i))+ 3)]; \
-    ss[4] ^= ss[3]; k[v(48,(6*(i))+10)] = ss[6] ^= k[v(48,(6*(i))+ 4)]; \
-    ss[5] ^= ss[4]; k[v(48,(6*(i))+11)] = ss[6] ^= k[v(48,(6*(i))+ 5)]; \
-}
-
-#define kdl6(k,i) \
-{   ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[v(48,(6*(i))+ 6)] = ss[0]; \
-    ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ss[1]; \
-    ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ss[2]; \
-    ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ss[3]; \
-}
-
-AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1])
-{   uint_32t    ss[7];
-#if defined( d_vars )
-        d_vars;
-#endif
-    cx->ks[v(48,(0))] = ss[0] = word_in(key, 0);
-    cx->ks[v(48,(1))] = ss[1] = word_in(key, 1);
-    cx->ks[v(48,(2))] = ss[2] = word_in(key, 2);
-    cx->ks[v(48,(3))] = ss[3] = word_in(key, 3);
-
-#if DEC_UNROLL == NONE
-    cx->ks[v(48,(4))] = ss[4] = word_in(key, 4);
-    cx->ks[v(48,(5))] = ss[5] = word_in(key, 5);
-    {   uint_32t i;
-
-        for(i = 0; i < 7; ++i)
-            k6e(cx->ks, i);
-        k6ef(cx->ks, 7);
-#if !(DEC_ROUND == NO_TABLES)
-        for(i = N_COLS; i < 12 * N_COLS; ++i)
-            cx->ks[i] = inv_mcol(cx->ks[i]);
-#endif
-    }
-#else
-    cx->ks[v(48,(4))] = ff(ss[4] = word_in(key, 4));
-    cx->ks[v(48,(5))] = ff(ss[5] = word_in(key, 5));
-    kdf6(cx->ks, 0); kd6(cx->ks, 1);
-    kd6(cx->ks, 2);  kd6(cx->ks, 3);
-    kd6(cx->ks, 4);  kd6(cx->ks, 5);
-    kd6(cx->ks, 6); kdl6(cx->ks, 7);
-#endif
-    cx->inf.l = 0;
-    cx->inf.b[0] = 12 * 16;
-
-#ifdef USE_VIA_ACE_IF_PRESENT
-    if(VIA_ACE_AVAILABLE)
-        cx->inf.b[1] = 0xff;
-#endif
-
-#if defined( AES_ERR_CHK )
-    return EXIT_SUCCESS;
-#endif
-}
-
-#endif
-
-#if defined(AES_256) || defined(AES_VAR)
-
-#define k8ef(k,i) \
-{   k[v(56,(8*(i))+ 8)] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; \
-    k[v(56,(8*(i))+ 9)] = ss[1] ^= ss[0]; \
-    k[v(56,(8*(i))+10)] = ss[2] ^= ss[1]; \
-    k[v(56,(8*(i))+11)] = ss[3] ^= ss[2]; \
-}
-
-#define k8e(k,i) \
-{   k8ef(k,i); \
-    k[v(56,(8*(i))+12)] = ss[4] ^= ls_box(ss[3],0); \
-    k[v(56,(8*(i))+13)] = ss[5] ^= ss[4]; \
-    k[v(56,(8*(i))+14)] = ss[6] ^= ss[5]; \
-    k[v(56,(8*(i))+15)] = ss[7] ^= ss[6]; \
-}
-
-#define kdf8(k,i) \
-{   ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[v(56,(8*(i))+ 8)] = ff(ss[0]); \
-    ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ff(ss[1]); \
-    ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ff(ss[2]); \
-    ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ff(ss[3]); \
-    ss[4] ^= ls_box(ss[3],0); k[v(56,(8*(i))+12)] = ff(ss[4]); \
-    ss[5] ^= ss[4]; k[v(56,(8*(i))+13)] = ff(ss[5]); \
-    ss[6] ^= ss[5]; k[v(56,(8*(i))+14)] = ff(ss[6]); \
-    ss[7] ^= ss[6]; k[v(56,(8*(i))+15)] = ff(ss[7]); \
-}
-
-#define kd8(k,i) \
-{   ss[8] = ls_box(ss[7],3) ^ t_use(r,c)[i]; \
-    ss[0] ^= ss[8]; ss[8] = ff(ss[8]); k[v(56,(8*(i))+ 8)] = ss[8] ^= k[v(56,(8*(i)))]; \
-    ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ss[8] ^= k[v(56,(8*(i))+ 1)]; \
-    ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ss[8] ^= k[v(56,(8*(i))+ 2)]; \
-    ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ss[8] ^= k[v(56,(8*(i))+ 3)]; \
-    ss[8] = ls_box(ss[3],0); \
-    ss[4] ^= ss[8]; ss[8] = ff(ss[8]); k[v(56,(8*(i))+12)] = ss[8] ^= k[v(56,(8*(i))+ 4)]; \
-    ss[5] ^= ss[4]; k[v(56,(8*(i))+13)] = ss[8] ^= k[v(56,(8*(i))+ 5)]; \
-    ss[6] ^= ss[5]; k[v(56,(8*(i))+14)] = ss[8] ^= k[v(56,(8*(i))+ 6)]; \
-    ss[7] ^= ss[6]; k[v(56,(8*(i))+15)] = ss[8] ^= k[v(56,(8*(i))+ 7)]; \
-}
-
-#define kdl8(k,i) \
-{   ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[v(56,(8*(i))+ 8)] = ss[0]; \
-    ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ss[1]; \
-    ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ss[2]; \
-    ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ss[3]; \
-}
-
-AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1])
-{   uint_32t    ss[9];
-#if defined( d_vars )
-        d_vars;
-#endif
-    cx->ks[v(56,(0))] = ss[0] = word_in(key, 0);
-    cx->ks[v(56,(1))] = ss[1] = word_in(key, 1);
-    cx->ks[v(56,(2))] = ss[2] = word_in(key, 2);
-    cx->ks[v(56,(3))] = ss[3] = word_in(key, 3);
-
-#if DEC_UNROLL == NONE
-    cx->ks[v(56,(4))] = ss[4] = word_in(key, 4);
-    cx->ks[v(56,(5))] = ss[5] = word_in(key, 5);
-    cx->ks[v(56,(6))] = ss[6] = word_in(key, 6);
-    cx->ks[v(56,(7))] = ss[7] = word_in(key, 7);
-    {   uint_32t i;
-
-        for(i = 0; i < 6; ++i)
-            k8e(cx->ks,  i);
-        k8ef(cx->ks,  6);
-#if !(DEC_ROUND == NO_TABLES)
-        for(i = N_COLS; i < 14 * N_COLS; ++i)
-            cx->ks[i] = inv_mcol(cx->ks[i]);
-
-#endif
-    }
-#else
-    ss[4] = word_in(key, 4); cx->ks[v(56,(4))] = ff(ss[4]);
-    ss[5] = word_in(key, 5); cx->ks[v(56,(5))] = ff(ss[5]);
-    ss[6] = word_in(key, 6); cx->ks[v(56,(6))] = ff(ss[6]);
-    ss[7] = word_in(key, 7); cx->ks[v(56,(7))] = ff(ss[7]);
-    kdf8(cx->ks, 0); kd8(cx->ks, 1);
-    kd8(cx->ks, 2);  kd8(cx->ks, 3);
-    kd8(cx->ks, 4);  kd8(cx->ks, 5);
-    kdl8(cx->ks, 6);
-#endif
-    cx->inf.l = 0;
-    cx->inf.b[0] = 14 * 16;
-
-#ifdef USE_VIA_ACE_IF_PRESENT
-    if(VIA_ACE_AVAILABLE)
-        cx->inf.b[1] = 0xff;
-#endif
-
-#if defined( AES_ERR_CHK )
-    return EXIT_SUCCESS;
-#endif
-}
-
-#endif
-
-#if defined(AES_VAR)
-
-AES_RETURN aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1])
-{
-    switch(key_len)
-    {
-#if defined( AES_ERR_CHK )
-    case 16: case 128: return aes_decrypt_key128(key, cx);
-    case 24: case 192: return aes_decrypt_key192(key, cx);
-    case 32: case 256: return aes_decrypt_key256(key, cx);
-    default: return EXIT_FAILURE;
-#else
-    case 16: case 128: aes_decrypt_key128(key, cx); return;
-    case 24: case 192: aes_decrypt_key192(key, cx); return;
-    case 32: case 256: aes_decrypt_key256(key, cx); return;
-#endif
-    }
-}
-
-#endif
-
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+*/
+
+#include "Aesopt.h"
+#include "Aestab.h"
+
+#ifdef USE_VIA_ACE_IF_PRESENT
+#  include "aes_via_ace.h"
+#endif
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/* Initialise the key schedule from the user supplied key. The key
+   length can be specified in bytes, with legal values of 16, 24
+   and 32, or in bits, with legal values of 128, 192 and 256. These
+   values correspond with Nk values of 4, 6 and 8 respectively.
+
+   The following macros implement a single cycle in the key
+   schedule generation process. The number of cycles needed
+   for each cx->n_col and nk value is:
+
+    nk =             4  5  6  7  8
+    ------------------------------
+    cx->n_col = 4   10  9  8  7  7
+    cx->n_col = 5   14 11 10  9  9
+    cx->n_col = 6   19 15 12 11 11
+    cx->n_col = 7   21 19 16 13 14
+    cx->n_col = 8   29 23 19 17 14
+*/
+
+#if (FUNCS_IN_C & ENC_KEYING_IN_C)
+
+#if defined(AES_128) || defined(AES_VAR)
+
+#define ke4(k,i) \
+{   k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; \
+    k[4*(i)+5] = ss[1] ^= ss[0]; \
+    k[4*(i)+6] = ss[2] ^= ss[1]; \
+    k[4*(i)+7] = ss[3] ^= ss[2]; \
+}
+
+AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]) /* fills cx->ks[0..43] from the four words read from key */
+{   uint_32t    ss[4]; /* running copy of the four most recent schedule words */
+    /* the first four schedule words are the key words themselves, read via word_in() */
+    cx->ks[0] = ss[0] = word_in(key, 0);
+    cx->ks[1] = ss[1] = word_in(key, 1);
+    cx->ks[2] = ss[2] = word_in(key, 2);
+    cx->ks[3] = ss[3] = word_in(key, 3);
+    /* cycles 0..8: a loop when ENC_UNROLL == NONE, written out in full otherwise */
+#if ENC_UNROLL == NONE
+    {   uint_32t i;
+        for(i = 0; i < 9; ++i)
+            ke4(cx->ks, i);
+    }
+#else
+    ke4(cx->ks, 0);  ke4(cx->ks, 1);
+    ke4(cx->ks, 2);  ke4(cx->ks, 3);
+    ke4(cx->ks, 4);  ke4(cx->ks, 5);
+    ke4(cx->ks, 6);  ke4(cx->ks, 7);
+    ke4(cx->ks, 8);
+#endif
+    ke4(cx->ks, 9); /* tenth and final cycle, shared by both branches */
+    cx->inf.l = 0; /* zero inf.l before the individual inf.b[] bytes are set */
+    cx->inf.b[0] = 10 * 16; /* 160; NOTE(review): presumably rounds * 16, read by the cipher routines -- confirm */
+    /* inf.b[1] becomes 0xff only when VIA_ACE_AVAILABLE evaluates true */
+#ifdef USE_VIA_ACE_IF_PRESENT
+    if(VIA_ACE_AVAILABLE)
+        cx->inf.b[1] = 0xff;
+#endif
+    /* a value is returned only in AES_ERR_CHK builds; otherwise control runs off the end */
+#if defined( AES_ERR_CHK )
+    return EXIT_SUCCESS;
+#endif
+}
+
+#endif
+
+#if defined(AES_192) || defined(AES_VAR)
+
+#define kef6(k,i) /* first four words of a 192-bit cycle: fills k[6*(i)+6 .. 6*(i)+9]; ss[4] and ss[5] are not updated */ \
+{   k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; /* ls_box and t_use are defined elsewhere -- semantics not visible here */ \
+    k[6*(i)+ 7] = ss[1] ^= ss[0]; \
+    k[6*(i)+ 8] = ss[2] ^= ss[1]; \
+    k[6*(i)+ 9] = ss[3] ^= ss[2]; \
+}
+
+#define ke6(k,i) /* full six-word cycle: kef6 plus k[6*(i)+10] and k[6*(i)+11] */ \
+{   kef6(k,i); \
+    k[6*(i)+10] = ss[4] ^= ss[3]; \
+    k[6*(i)+11] = ss[5] ^= ss[4]; \
+}
+
+AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]) /* fills cx->ks[0..51] from the six words read from key */
+{   uint_32t    ss[6]; /* running copy of the six most recent schedule words */
+    /* the first six schedule words are the key words themselves, read via word_in() */
+    cx->ks[0] = ss[0] = word_in(key, 0);
+    cx->ks[1] = ss[1] = word_in(key, 1);
+    cx->ks[2] = ss[2] = word_in(key, 2);
+    cx->ks[3] = ss[3] = word_in(key, 3);
+    cx->ks[4] = ss[4] = word_in(key, 4);
+    cx->ks[5] = ss[5] = word_in(key, 5);
+    /* seven full six-word cycles (0..6): a loop when ENC_UNROLL == NONE, written out otherwise */
+#if ENC_UNROLL == NONE
+    {   uint_32t i;
+        for(i = 0; i < 7; ++i)
+            ke6(cx->ks, i);
+    }
+#else
+    ke6(cx->ks, 0);  ke6(cx->ks, 1);
+    ke6(cx->ks, 2);  ke6(cx->ks, 3);
+    ke6(cx->ks, 4);  ke6(cx->ks, 5);
+    ke6(cx->ks, 6);
+#endif
+    kef6(cx->ks, 7); /* last cycle is the four-word form, so writing stops at ks[51] */
+    cx->inf.l = 0; /* zero inf.l before the individual inf.b[] bytes are set */
+    cx->inf.b[0] = 12 * 16; /* 192; NOTE(review): presumably rounds * 16, read by the cipher routines -- confirm */
+    /* inf.b[1] becomes 0xff only when VIA_ACE_AVAILABLE evaluates true */
+#ifdef USE_VIA_ACE_IF_PRESENT
+    if(VIA_ACE_AVAILABLE)
+        cx->inf.b[1] = 0xff;
+#endif
+    /* a value is returned only in AES_ERR_CHK builds; otherwise control runs off the end */
+#if defined( AES_ERR_CHK )
+    return EXIT_SUCCESS;
+#endif
+}
+
+#endif
+
+#if defined(AES_256) || defined(AES_VAR)
+
+#define kef8(k,i) /* first four words of a 256-bit cycle: fills k[8*(i)+8 .. 8*(i)+11]; ss[4..7] are not updated */ \
+{   k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; /* ls_box and t_use are defined elsewhere -- semantics not visible here */ \
+    k[8*(i)+ 9] = ss[1] ^= ss[0]; \
+    k[8*(i)+10] = ss[2] ^= ss[1]; \
+    k[8*(i)+11] = ss[3] ^= ss[2]; \
+}
+
+#define ke8(k,i) /* full eight-word cycle: kef8 plus k[8*(i)+12 .. 8*(i)+15] */ \
+{   kef8(k,i); \
+    k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); /* second half starts with ls_box(.,0) and uses no t_use term */ \
+    k[8*(i)+13] = ss[5] ^= ss[4]; \
+    k[8*(i)+14] = ss[6] ^= ss[5]; \
+    k[8*(i)+15] = ss[7] ^= ss[6]; \
+}
+
+AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]) /* fills cx->ks[0..59] from the eight words read from key */
+{   uint_32t    ss[8]; /* running copy of the eight most recent schedule words */
+    /* the first eight schedule words are the key words themselves, read via word_in() */
+    cx->ks[0] = ss[0] = word_in(key, 0);
+    cx->ks[1] = ss[1] = word_in(key, 1);
+    cx->ks[2] = ss[2] = word_in(key, 2);
+    cx->ks[3] = ss[3] = word_in(key, 3);
+    cx->ks[4] = ss[4] = word_in(key, 4);
+    cx->ks[5] = ss[5] = word_in(key, 5);
+    cx->ks[6] = ss[6] = word_in(key, 6);
+    cx->ks[7] = ss[7] = word_in(key, 7);
+    /* six full eight-word cycles (0..5): a loop when ENC_UNROLL == NONE, written out otherwise */
+#if ENC_UNROLL == NONE
+    {   uint_32t i;
+        for(i = 0; i < 6; ++i)
+            ke8(cx->ks,  i);
+    }
+#else
+    ke8(cx->ks, 0); ke8(cx->ks, 1);
+    ke8(cx->ks, 2); ke8(cx->ks, 3);
+    ke8(cx->ks, 4); ke8(cx->ks, 5);
+#endif
+    kef8(cx->ks, 6); /* last cycle is the four-word form, so writing stops at ks[59] */
+    cx->inf.l = 0; /* zero inf.l before the individual inf.b[] bytes are set */
+    cx->inf.b[0] = 14 * 16; /* 224; NOTE(review): presumably rounds * 16, read by the cipher routines -- confirm */
+    /* inf.b[1] becomes 0xff only when VIA_ACE_AVAILABLE evaluates true */
+#ifdef USE_VIA_ACE_IF_PRESENT
+    if(VIA_ACE_AVAILABLE)
+        cx->inf.b[1] = 0xff;
+#endif
+    /* a value is returned only in AES_ERR_CHK builds; otherwise control runs off the end */
+#if defined( AES_ERR_CHK )
+    return EXIT_SUCCESS;
+#endif
+}
+
+#endif
+
+#if defined(AES_VAR)
+
+AES_RETURN aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]) /* dispatches to the fixed-size routine selected by key_len */
+{
+    switch(key_len) /* each size is accepted either as a byte count or as a bit count */
+    {
+#if defined( AES_ERR_CHK )
+    case 16: case 128: return aes_encrypt_key128(key, cx);
+    case 24: case 192: return aes_encrypt_key192(key, cx);
+    case 32: case 256: return aes_encrypt_key256(key, cx);
+    default: return EXIT_FAILURE; /* any other length is reported to the caller */
+#else
+    case 16: case 128: aes_encrypt_key128(key, cx); return;
+    case 24: case 192: aes_encrypt_key192(key, cx); return;
+    case 32: case 256: aes_encrypt_key256(key, cx); return;
+#endif
+    } /* without AES_ERR_CHK there is no default: any other length leaves cx untouched and reports nothing */
+}
+
+#endif
+
+#endif
+
+#if (FUNCS_IN_C & DEC_KEYING_IN_C)
+
+/* this is used to store the decryption round keys  */
+/* in forward or reverse order                      */
+
+#ifdef AES_REV_DKS /* v(n,i) maps a logical schedule index i to the slot actually written in cx->ks */
+#define v(n,i)  ((n) - (i) + 2 * ((i) & 3)) /* for i = 4*g + j this is n - 4*g + j: four-word groups reversed, word order inside a group kept */
+#else
+#define v(n,i)  (i) /* forward order: identity mapping */
+#endif
+
+#if DEC_ROUND == NO_TABLES /* ff(x) is the transform the unrolled decrypt macros apply to stored words */
+#define ff(x)   (x) /* no transform in the NO_TABLES configuration */
+#else
+#define ff(x)   inv_mcol(x) /* inv_mcol is defined elsewhere; presumably the inverse MixColumns step -- TODO confirm */
+#if defined( dec_imvars )
+#define d_vars  dec_imvars /* locals inv_mcol may need; declared at the top of each decrypt key routine */
+#endif
+#endif
+
+#if defined(AES_128) || defined(AES_VAR)
+
+#define k4e(k,i) /* same recurrence as the encryption cycle, but every store goes through v(40, .) */ \
+{   k[v(40,(4*(i))+4)] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; \
+    k[v(40,(4*(i))+5)] = ss[1] ^= ss[0]; \
+    k[v(40,(4*(i))+6)] = ss[2] ^= ss[1]; \
+    k[v(40,(4*(i))+7)] = ss[3] ^= ss[2]; \
+}
+
+#if 1 /* this first set of kdf4 / kd4 / kdl4 is the one compiled; the set after #else never is */
+
+#define kdf4(k,i) /* first-cycle variant; aes_decrypt_key128 invokes it with i = 0 */ \
+{   ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; /* fold the key words into combined form before the first cycle */ \
+    ss[1] = ss[1] ^ ss[3]; \
+    ss[2] = ss[2] ^ ss[3]; \
+    ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; /* ss[4] is scratch for this cycle's new term */ \
+    ss[i % 4] ^= ss[4]; \
+    ss[4] ^= k[v(40,(4*(i)))];   k[v(40,(4*(i))+4)] = ff(ss[4]); /* running xor with the previous four words; ff() applied on store only */ \
+    ss[4] ^= k[v(40,(4*(i))+1)]; k[v(40,(4*(i))+5)] = ff(ss[4]); \
+    ss[4] ^= k[v(40,(4*(i))+2)]; k[v(40,(4*(i))+6)] = ff(ss[4]); \
+    ss[4] ^= k[v(40,(4*(i))+3)]; k[v(40,(4*(i))+7)] = ff(ss[4]); \
+}
+
+#define kd4(k,i) /* middle-cycle variant; aes_decrypt_key128 invokes it with i = 1..8 */ \
+{   ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; \
+    ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); /* ff() is applied to the new term once, before the running xor */ \
+    k[v(40,(4*(i))+4)] = ss[4] ^= k[v(40,(4*(i)))]; /* previous words are read back from k, so they already carry ff() */ \
+    k[v(40,(4*(i))+5)] = ss[4] ^= k[v(40,(4*(i))+1)]; \
+    k[v(40,(4*(i))+6)] = ss[4] ^= k[v(40,(4*(i))+2)]; \
+    k[v(40,(4*(i))+7)] = ss[4] ^= k[v(40,(4*(i))+3)]; \
+}
+
+#define kdl4(k,i) /* last-cycle variant; aes_decrypt_key128 invokes it with i = 9; stores are made without ff() */ \
+{   ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \
+    k[v(40,(4*(i))+4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
+    k[v(40,(4*(i))+5)] = ss[1] ^ ss[3]; \
+    k[v(40,(4*(i))+6)] = ss[0]; \
+    k[v(40,(4*(i))+7)] = ss[1]; \
+}
+
+#else /* alternative formulation, excluded by the '#if 1' above */
+
+#define kdf4(k,i) \
+{   ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[v(40,(4*(i))+ 4)] = ff(ss[0]); \
+    ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ff(ss[1]); \
+    ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ff(ss[2]); \
+    ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ff(ss[3]); \
+}
+
+#define kd4(k,i) \
+{   ss[4] = ls_box(ss[3],3) ^ t_use(r,c)[i]; \
+    ss[0] ^= ss[4]; ss[4] = ff(ss[4]); k[v(40,(4*(i))+ 4)] = ss[4] ^= k[v(40,(4*(i)))]; \
+    ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ss[4] ^= k[v(40,(4*(i))+ 1)]; \
+    ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ss[4] ^= k[v(40,(4*(i))+ 2)]; \
+    ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ss[4] ^= k[v(40,(4*(i))+ 3)]; \
+}
+
+#define kdl4(k,i) \
+{   ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[v(40,(4*(i))+ 4)] = ss[0]; \
+    ss[1] ^= ss[0]; k[v(40,(4*(i))+ 5)] = ss[1]; \
+    ss[2] ^= ss[1]; k[v(40,(4*(i))+ 6)] = ss[2]; \
+    ss[3] ^= ss[2]; k[v(40,(4*(i))+ 7)] = ss[3]; \
+}
+
+#endif
+
+AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]) /* builds the 44-word decryption schedule in cx->ks, slots chosen by v(40, .) */
+{   uint_32t    ss[5]; /* ss[0..3] running words, ss[4] scratch used by the kd*4 macros */
+#if defined( d_vars )
+        d_vars;
+#endif
+    cx->ks[v(40,(0))] = ss[0] = word_in(key, 0);
+    cx->ks[v(40,(1))] = ss[1] = word_in(key, 1);
+    cx->ks[v(40,(2))] = ss[2] = word_in(key, 2);
+    cx->ks[v(40,(3))] = ss[3] = word_in(key, 3);
+    /* looped path: plain expansion, then one inv_mcol pass; unrolled path: ff() is applied inside the macros */
+#if DEC_UNROLL == NONE
+    {   uint_32t i;
+        for(i = 0; i < 10; ++i)
+            k4e(cx->ks, i);
+#if !(DEC_ROUND == NO_TABLES)
+        for(i = N_COLS; i < 10 * N_COLS; ++i) /* plain index, not v(); covers every slot except the first and last N_COLS words */
+            cx->ks[i] = inv_mcol(cx->ks[i]);
+#endif
+    }
+#else
+    kdf4(cx->ks, 0);  kd4(cx->ks, 1);
+     kd4(cx->ks, 2);  kd4(cx->ks, 3);
+     kd4(cx->ks, 4);  kd4(cx->ks, 5);
+     kd4(cx->ks, 6);  kd4(cx->ks, 7);
+     kd4(cx->ks, 8); kdl4(cx->ks, 9);
+#endif
+    cx->inf.l = 0; /* zero inf.l before the individual inf.b[] bytes are set */
+    cx->inf.b[0] = 10 * 16; /* 160; NOTE(review): presumably rounds * 16, read by the cipher routines -- confirm */
+    /* inf.b[1] becomes 0xff only when VIA_ACE_AVAILABLE evaluates true */
+#ifdef USE_VIA_ACE_IF_PRESENT
+    if(VIA_ACE_AVAILABLE)
+        cx->inf.b[1] = 0xff;
+#endif
+    /* a value is returned only in AES_ERR_CHK builds; otherwise control runs off the end */
+#if defined( AES_ERR_CHK )
+    return EXIT_SUCCESS;
+#endif
+}
+
+#endif
+
+#if defined(AES_192) || defined(AES_VAR)
+
+#define k6ef(k,i) /* four-word 192-bit cycle with stores routed through v(48, .); no ff() applied */ \
+{   k[v(48,(6*(i))+ 6)] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; \
+    k[v(48,(6*(i))+ 7)] = ss[1] ^= ss[0]; \
+    k[v(48,(6*(i))+ 8)] = ss[2] ^= ss[1]; \
+    k[v(48,(6*(i))+ 9)] = ss[3] ^= ss[2]; \
+}
+
+#define k6e(k,i) /* full six-word cycle: k6ef plus logical words 6*(i)+10 and 6*(i)+11 */ \
+{   k6ef(k,i); \
+    k[v(48,(6*(i))+10)] = ss[4] ^= ss[3]; \
+    k[v(48,(6*(i))+11)] = ss[5] ^= ss[4]; \
+}
+
+#define kdf6(k,i) /* first-cycle variant for the unrolled path (invoked with i = 0): plain recurrence, ff() on each store */ \
+{   ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[v(48,(6*(i))+ 6)] = ff(ss[0]); \
+    ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ff(ss[1]); \
+    ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ff(ss[2]); \
+    ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ff(ss[3]); \
+    ss[4] ^= ss[3]; k[v(48,(6*(i))+10)] = ff(ss[4]); \
+    ss[5] ^= ss[4]; k[v(48,(6*(i))+11)] = ff(ss[5]); \
+}
+
+#define kd6(k,i) /* middle-cycle variant (invoked with i = 1..6): ss[6] is scratch holding the ff()-ed new term */ \
+{   ss[6] = ls_box(ss[5],3) ^ t_use(r,c)[i]; \
+    ss[0] ^= ss[6]; ss[6] = ff(ss[6]); k[v(48,(6*(i))+ 6)] = ss[6] ^= k[v(48,(6*(i)))]; /* stored words are a running xor with the previous six stored words */ \
+    ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ss[6] ^= k[v(48,(6*(i))+ 1)]; \
+    ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ss[6] ^= k[v(48,(6*(i))+ 2)]; \
+    ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ss[6] ^= k[v(48,(6*(i))+ 3)]; \
+    ss[4] ^= ss[3]; k[v(48,(6*(i))+10)] = ss[6] ^= k[v(48,(6*(i))+ 4)]; \
+    ss[5] ^= ss[4]; k[v(48,(6*(i))+11)] = ss[6] ^= k[v(48,(6*(i))+ 5)]; \
+}
+
+#define kdl6(k,i) /* last-cycle variant (invoked with i = 7): only four words, stored without ff() */ \
+{   ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[v(48,(6*(i))+ 6)] = ss[0]; \
+    ss[1] ^= ss[0]; k[v(48,(6*(i))+ 7)] = ss[1]; \
+    ss[2] ^= ss[1]; k[v(48,(6*(i))+ 8)] = ss[2]; \
+    ss[3] ^= ss[2]; k[v(48,(6*(i))+ 9)] = ss[3]; \
+}
+
+AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]) /* builds the 52-word decryption schedule in cx->ks, slots chosen by v(48, .) */
+{   uint_32t    ss[7]; /* ss[0..5] running words, ss[6] scratch used by kd6 */
+#if defined( d_vars )
+        d_vars;
+#endif
+    cx->ks[v(48,(0))] = ss[0] = word_in(key, 0);
+    cx->ks[v(48,(1))] = ss[1] = word_in(key, 1);
+    cx->ks[v(48,(2))] = ss[2] = word_in(key, 2);
+    cx->ks[v(48,(3))] = ss[3] = word_in(key, 3);
+    /* key words 4 and 5 are stored raw in the looped path but through ff() in the unrolled path */
+#if DEC_UNROLL == NONE
+    cx->ks[v(48,(4))] = ss[4] = word_in(key, 4);
+    cx->ks[v(48,(5))] = ss[5] = word_in(key, 5);
+    {   uint_32t i;
+        /* seven full cycles plus a final four-word cycle, then one inv_mcol pass */
+        for(i = 0; i < 7; ++i)
+            k6e(cx->ks, i);
+        k6ef(cx->ks, 7);
+#if !(DEC_ROUND == NO_TABLES)
+        for(i = N_COLS; i < 12 * N_COLS; ++i) /* plain index, not v(); covers every slot except the first and last N_COLS words */
+            cx->ks[i] = inv_mcol(cx->ks[i]);
+#endif
+    }
+#else
+    cx->ks[v(48,(4))] = ff(ss[4] = word_in(key, 4));
+    cx->ks[v(48,(5))] = ff(ss[5] = word_in(key, 5));
+    kdf6(cx->ks, 0); kd6(cx->ks, 1);
+    kd6(cx->ks, 2);  kd6(cx->ks, 3);
+    kd6(cx->ks, 4);  kd6(cx->ks, 5);
+    kd6(cx->ks, 6); kdl6(cx->ks, 7);
+#endif
+    cx->inf.l = 0; /* zero inf.l before the individual inf.b[] bytes are set */
+    cx->inf.b[0] = 12 * 16; /* 192; NOTE(review): presumably rounds * 16, read by the cipher routines -- confirm */
+    /* inf.b[1] becomes 0xff only when VIA_ACE_AVAILABLE evaluates true */
+#ifdef USE_VIA_ACE_IF_PRESENT
+    if(VIA_ACE_AVAILABLE)
+        cx->inf.b[1] = 0xff;
+#endif
+    /* a value is returned only in AES_ERR_CHK builds; otherwise control runs off the end */
+#if defined( AES_ERR_CHK )
+    return EXIT_SUCCESS;
+#endif
+}
+
+#endif
+
+#if defined(AES_256) || defined(AES_VAR)
+
+#define k8ef(k,i) /* four-word 256-bit cycle with stores routed through v(56, .); no ff() applied */ \
+{   k[v(56,(8*(i))+ 8)] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; \
+    k[v(56,(8*(i))+ 9)] = ss[1] ^= ss[0]; \
+    k[v(56,(8*(i))+10)] = ss[2] ^= ss[1]; \
+    k[v(56,(8*(i))+11)] = ss[3] ^= ss[2]; \
+}
+
+#define k8e(k,i) /* full eight-word cycle: k8ef plus logical words 8*(i)+12 .. 8*(i)+15 */ \
+{   k8ef(k,i); \
+    k[v(56,(8*(i))+12)] = ss[4] ^= ls_box(ss[3],0); \
+    k[v(56,(8*(i))+13)] = ss[5] ^= ss[4]; \
+    k[v(56,(8*(i))+14)] = ss[6] ^= ss[5]; \
+    k[v(56,(8*(i))+15)] = ss[7] ^= ss[6]; \
+}
+
+#define kdf8(k,i) /* first-cycle variant for the unrolled path (invoked with i = 0): plain recurrence, ff() on each store */ \
+{   ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[v(56,(8*(i))+ 8)] = ff(ss[0]); \
+    ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ff(ss[1]); \
+    ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ff(ss[2]); \
+    ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ff(ss[3]); \
+    ss[4] ^= ls_box(ss[3],0); k[v(56,(8*(i))+12)] = ff(ss[4]); \
+    ss[5] ^= ss[4]; k[v(56,(8*(i))+13)] = ff(ss[5]); \
+    ss[6] ^= ss[5]; k[v(56,(8*(i))+14)] = ff(ss[6]); \
+    ss[7] ^= ss[6]; k[v(56,(8*(i))+15)] = ff(ss[7]); \
+}
+
+#define kd8(k,i) /* middle-cycle variant (invoked with i = 1..5): ss[8] is scratch, reloaded for each four-word half */ \
+{   ss[8] = ls_box(ss[7],3) ^ t_use(r,c)[i]; \
+    ss[0] ^= ss[8]; ss[8] = ff(ss[8]); k[v(56,(8*(i))+ 8)] = ss[8] ^= k[v(56,(8*(i)))]; /* stored words are a running xor with the previous eight stored words */ \
+    ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ss[8] ^= k[v(56,(8*(i))+ 1)]; \
+    ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ss[8] ^= k[v(56,(8*(i))+ 2)]; \
+    ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ss[8] ^= k[v(56,(8*(i))+ 3)]; \
+    ss[8] = ls_box(ss[3],0); /* second half restarts the scratch term from ss[3], with no t_use term */ \
+    ss[4] ^= ss[8]; ss[8] = ff(ss[8]); k[v(56,(8*(i))+12)] = ss[8] ^= k[v(56,(8*(i))+ 4)]; \
+    ss[5] ^= ss[4]; k[v(56,(8*(i))+13)] = ss[8] ^= k[v(56,(8*(i))+ 5)]; \
+    ss[6] ^= ss[5]; k[v(56,(8*(i))+14)] = ss[8] ^= k[v(56,(8*(i))+ 6)]; \
+    ss[7] ^= ss[6]; k[v(56,(8*(i))+15)] = ss[8] ^= k[v(56,(8*(i))+ 7)]; \
+}
+
+#define kdl8(k,i) /* last-cycle variant (invoked with i = 6): only four words, stored without ff() */ \
+{   ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[v(56,(8*(i))+ 8)] = ss[0]; \
+    ss[1] ^= ss[0]; k[v(56,(8*(i))+ 9)] = ss[1]; \
+    ss[2] ^= ss[1]; k[v(56,(8*(i))+10)] = ss[2]; \
+    ss[3] ^= ss[2]; k[v(56,(8*(i))+11)] = ss[3]; \
+}
+
+AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]) /* builds the 60-word decryption schedule in cx->ks, slots chosen by v(56, .) */
+{   uint_32t    ss[9]; /* ss[0..7] running words, ss[8] scratch used by kd8 */
+#if defined( d_vars )
+        d_vars;
+#endif
+    cx->ks[v(56,(0))] = ss[0] = word_in(key, 0);
+    cx->ks[v(56,(1))] = ss[1] = word_in(key, 1);
+    cx->ks[v(56,(2))] = ss[2] = word_in(key, 2);
+    cx->ks[v(56,(3))] = ss[3] = word_in(key, 3);
+    /* key words 4..7 are stored raw in the looped path but through ff() in the unrolled path */
+#if DEC_UNROLL == NONE
+    cx->ks[v(56,(4))] = ss[4] = word_in(key, 4);
+    cx->ks[v(56,(5))] = ss[5] = word_in(key, 5);
+    cx->ks[v(56,(6))] = ss[6] = word_in(key, 6);
+    cx->ks[v(56,(7))] = ss[7] = word_in(key, 7);
+    {   uint_32t i;
+        /* six full cycles plus a final four-word cycle, then one inv_mcol pass */
+        for(i = 0; i < 6; ++i)
+            k8e(cx->ks,  i);
+        k8ef(cx->ks,  6);
+#if !(DEC_ROUND == NO_TABLES)
+        for(i = N_COLS; i < 14 * N_COLS; ++i) /* plain index, not v(); covers every slot except the first and last N_COLS words */
+            cx->ks[i] = inv_mcol(cx->ks[i]);
+
+#endif
+    }
+#else
+    ss[4] = word_in(key, 4); cx->ks[v(56,(4))] = ff(ss[4]);
+    ss[5] = word_in(key, 5); cx->ks[v(56,(5))] = ff(ss[5]);
+    ss[6] = word_in(key, 6); cx->ks[v(56,(6))] = ff(ss[6]);
+    ss[7] = word_in(key, 7); cx->ks[v(56,(7))] = ff(ss[7]);
+    kdf8(cx->ks, 0); kd8(cx->ks, 1);
+    kd8(cx->ks, 2);  kd8(cx->ks, 3);
+    kd8(cx->ks, 4);  kd8(cx->ks, 5);
+    kdl8(cx->ks, 6);
+#endif
+    cx->inf.l = 0; /* zero inf.l before the individual inf.b[] bytes are set */
+    cx->inf.b[0] = 14 * 16; /* 224; NOTE(review): presumably rounds * 16, read by the cipher routines -- confirm */
+    /* inf.b[1] becomes 0xff only when VIA_ACE_AVAILABLE evaluates true */
+#ifdef USE_VIA_ACE_IF_PRESENT
+    if(VIA_ACE_AVAILABLE)
+        cx->inf.b[1] = 0xff;
+#endif
+    /* a value is returned only in AES_ERR_CHK builds; otherwise control runs off the end */
+#if defined( AES_ERR_CHK )
+    return EXIT_SUCCESS;
+#endif
+}
+
+#endif
+
+#if defined(AES_VAR)
+
+AES_RETURN aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]) /* dispatches to the fixed-size routine selected by key_len */
+{
+    switch(key_len) /* each size is accepted either as a byte count or as a bit count */
+    {
+#if defined( AES_ERR_CHK )
+    case 16: case 128: return aes_decrypt_key128(key, cx);
+    case 24: case 192: return aes_decrypt_key192(key, cx);
+    case 32: case 256: return aes_decrypt_key256(key, cx);
+    default: return EXIT_FAILURE; /* any other length is reported to the caller */
+#else
+    case 16: case 128: aes_decrypt_key128(key, cx); return;
+    case 24: case 192: aes_decrypt_key192(key, cx); return;
+    case 32: case 256: aes_decrypt_key256(key, cx); return;
+#endif
+    } /* without AES_ERR_CHK there is no default: any other length leaves cx untouched and reports nothing */
+}
+
+#endif
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/src/Crypto/Aesopt.h b/src/Crypto/Aesopt.h
index 1b793e43..cf7edbe2 100644
--- a/src/Crypto/Aesopt.h
+++ b/src/Crypto/Aesopt.h
@@ -1,734 +1,734 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 20/12/2007
-
- This file contains the compilation options for AES (Rijndael) and code
- that is common across encryption, key scheduling and table generation.
-
- OPERATION
-
- These source code files implement the AES algorithm Rijndael designed by
- Joan Daemen and Vincent Rijmen. This version is designed for the standard
- block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24
- and 32 bytes).
-
- This version is designed for flexibility and speed using operations on
- 32-bit words rather than operations on bytes.  It can be compiled with
- either big or little endian internal byte order but is faster when the
- native byte order for the processor is used.
-
- THE CIPHER INTERFACE
-
- The cipher interface is implemented as an array of bytes in which lower
- AES bit sequence indexes map to higher numeric significance within bytes.
-
-  uint_8t                 (an unsigned  8-bit type)
-  uint_32t                (an unsigned 32-bit type)
-  struct aes_encrypt_ctx  (structure for the cipher encryption context)
-  struct aes_decrypt_ctx  (structure for the cipher decryption context)
-  AES_RETURN                the function return type
-
-  C subroutine calls:
-
-  AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]);
-  AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]);
-  AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]);
-  AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out,
-                                                  const aes_encrypt_ctx cx[1]);
-
-  AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]);
-  AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]);
-  AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]);
-  AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out,
-                                                  const aes_decrypt_ctx cx[1]);
-
- IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that
- you call aes_init() before AES is used so that the tables are initialised.
-
- C++ aes class subroutines:
-
-     Class AESencrypt  for encryption
-
-      Construtors:
-          AESencrypt(void)
-          AESencrypt(const unsigned char *key) - 128 bit key
-      Members:
-          AES_RETURN key128(const unsigned char *key)
-          AES_RETURN key192(const unsigned char *key)
-          AES_RETURN key256(const unsigned char *key)
-          AES_RETURN encrypt(const unsigned char *in, unsigned char *out) const
-
-      Class AESdecrypt  for encryption
-      Construtors:
-          AESdecrypt(void)
-          AESdecrypt(const unsigned char *key) - 128 bit key
-      Members:
-          AES_RETURN key128(const unsigned char *key)
-          AES_RETURN key192(const unsigned char *key)
-          AES_RETURN key256(const unsigned char *key)
-          AES_RETURN decrypt(const unsigned char *in, unsigned char *out) const
-*/
-
-/* Adapted for TrueCrypt */
-
-#if !defined( _AESOPT_H )
-#define _AESOPT_H
-
-#ifdef TC_WINDOWS_BOOT
-#define ASM_X86_V2
-#endif
-
-#if defined( __cplusplus )
-#include "Aescpp.h"
-#else
-#include "Aes.h"
-#endif
-
-
-#include "Common/Endian.h"
-#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
-#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
-#endif
-
-#if BYTE_ORDER == BIG_ENDIAN
-#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
-#endif
-
-
-/*  CONFIGURATION - THE USE OF DEFINES
-
-    Later in this section there are a number of defines that control the
-    operation of the code.  In each section, the purpose of each define is
-    explained so that the relevant form can be included or excluded by
-    setting either 1's or 0's respectively on the branches of the related
-    #if clauses.  The following local defines should not be changed.
-*/
-
-#define ENCRYPTION_IN_C     1
-#define DECRYPTION_IN_C     2
-#define ENC_KEYING_IN_C     4
-#define DEC_KEYING_IN_C     8
-
-#define NO_TABLES           0
-#define ONE_TABLE           1
-#define FOUR_TABLES         4
-#define NONE                0
-#define PARTIAL             1
-#define FULL                2
-
-/*  --- START OF USER CONFIGURED OPTIONS --- */
-
-/*  1. BYTE ORDER WITHIN 32 BIT WORDS
-
-    The fundamental data processing units in Rijndael are 8-bit bytes. The
-    input, output and key input are all enumerated arrays of bytes in which
-    bytes are numbered starting at zero and increasing to one less than the
-    number of bytes in the array in question. This enumeration is only used
-    for naming bytes and does not imply any adjacency or order relationship
-    from one byte to another. When these inputs and outputs are considered
-    as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to
-    byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte.
-    In this implementation bits are numbered from 0 to 7 starting at the
-    numerically least significant end of each byte (bit n represents 2^n).
-
-    However, Rijndael can be implemented more efficiently using 32-bit
-    words by packing bytes into words so that bytes 4*n to 4*n+3 are placed
-    into word[n]. While in principle these bytes can be assembled into words
-    in any positions, this implementation only supports the two formats in
-    which bytes in adjacent positions within words also have adjacent byte
-    numbers. This order is called big-endian if the lowest numbered bytes
-    in words have the highest numeric significance and little-endian if the
-    opposite applies.
-
-    This code can work in either order irrespective of the order used by the
-    machine on which it runs. Normally the internal byte order will be set
-    to the order of the processor on which the code is to be run but this
-    define can be used to reverse this in special situations
-
-    WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set.
-    This define will hence be redefined later (in section 4) if necessary
-*/
-
-#if 1
-#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
-#elif 0
-#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN
-#elif 0
-#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN
-#else
-#error The algorithm byte order is not defined
-#endif
-
-/*  2. VIA ACE SUPPORT
-
-    Define this option if support for the VIA ACE is required. This uses
-    inline assembler instructions and is only implemented for the Microsoft,
-    Intel and GCC compilers.  If VIA ACE is known to be present, then defining
-    ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption
-    code.  If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if
-    it is detected (both present and enabled) but the normal AES code will
-    also be present.
-
-    When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte
-    aligned; other input/output buffers do not need to be 16 byte aligned
-    but there are very large performance gains if this can be arranged.
-    VIA ACE also requires the decryption key schedule to be in reverse
-    order (which later checks below ensure).
-*/
-
-#if 0 && !defined( USE_VIA_ACE_IF_PRESENT )
-#  define USE_VIA_ACE_IF_PRESENT
-#endif
-
-#if 0 && !defined( ASSUME_VIA_ACE_PRESENT )
-#  define ASSUME_VIA_ACE_PRESENT
-#  endif
-
-#if defined ( _WIN64 ) || defined( _WIN32_WCE ) || \
-                    defined( _MSC_VER ) && ( _MSC_VER <= 800 )
-#  if defined( USE_VIA_ACE_IF_PRESENT )
-#    undef USE_VIA_ACE_IF_PRESENT
-#  endif
-#  if defined( ASSUME_VIA_ACE_PRESENT )
-#    undef ASSUME_VIA_ACE_PRESENT
-#  endif
-#endif
-
-/*  3. ASSEMBLER SUPPORT
-
-    This define (which can be on the command line) enables the use of the
-    assembler code routines for encryption, decryption and key scheduling
-    as follows:
-
-    ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for
-                encryption and decryption and but with key scheduling in C
-    ASM_X86_V2  uses assembler (aes_x86_v2.asm) with compressed tables for
-                encryption, decryption and key scheduling
-    ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for
-                encryption and decryption and but with key scheduling in C
-    ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for
-                encryption and decryption and but with key scheduling in C
-
-    Change one 'if 0' below to 'if 1' to select the version or define
-    as a compilation option.
-*/
-
-#if 0 && !defined( ASM_X86_V1C )
-#  define ASM_X86_V1C
-#elif 0 && !defined( ASM_X86_V2  )
-#  define ASM_X86_V2
-#elif 0 && !defined( ASM_X86_V2C )
-#  define ASM_X86_V2C
-#elif 0 && !defined( ASM_AMD64_C )
-#  define ASM_AMD64_C
-#endif
-
-#if (defined ( ASM_X86_V1C ) || defined( ASM_X86_V2 ) || defined( ASM_X86_V2C )) \
-      && !defined( _M_IX86 ) || defined( ASM_AMD64_C ) && !defined( _M_X64 )
-//#  error Assembler code is only available for x86 and AMD64 systems
-#endif
-
-/*  4. FAST INPUT/OUTPUT OPERATIONS.
-
-    On some machines it is possible to improve speed by transferring the
-    bytes in the input and output arrays to and from the internal 32-bit
-    variables by addressing these arrays as if they are arrays of 32-bit
-    words.  On some machines this will always be possible but there may
-    be a large performance penalty if the byte arrays are not aligned on
-    the normal word boundaries. On other machines this technique will
-    lead to memory access errors when such 32-bit word accesses are not
-    properly aligned. The option SAFE_IO avoids such problems but will
-    often be slower on those machines that support misaligned access
-    (especially so if care is taken to align the input  and output byte
-    arrays on 32-bit word boundaries). If SAFE_IO is not defined it is
-    assumed that access to byte arrays as if they are arrays of 32-bit
-    words will not cause problems when such accesses are misaligned.
-*/
-#if 1 && !defined( _MSC_VER )
-#define SAFE_IO
-#endif
-
-/*  5. LOOP UNROLLING
-
-    The code for encryption and decrytpion cycles through a number of rounds
-    that can be implemented either in a loop or by expanding the code into a
-    long sequence of instructions, the latter producing a larger program but
-    one that will often be much faster. The latter is called loop unrolling.
-    There are also potential speed advantages in expanding two iterations in
-    a loop with half the number of iterations, which is called partial loop
-    unrolling.  The following options allow partial or full loop unrolling
-    to be set independently for encryption and decryption
-*/
-#if 1
-#define ENC_UNROLL  FULL
-#elif 0
-#define ENC_UNROLL  PARTIAL
-#else
-#define ENC_UNROLL  NONE
-#endif
-
-#if 1
-#define DEC_UNROLL  FULL
-#elif 0
-#define DEC_UNROLL  PARTIAL
-#else
-#define DEC_UNROLL  NONE
-#endif
-
-/*  6. FAST FINITE FIELD OPERATIONS
-
-    If this section is included, tables are used to provide faster finite
-    field arithmetic (this has no effect if FIXED_TABLES is defined).
-*/
-#if !defined (TC_WINDOWS_BOOT)
-#define FF_TABLES
-#endif
-
-/*  7. INTERNAL STATE VARIABLE FORMAT
-
-    The internal state of Rijndael is stored in a number of local 32-bit
-    word varaibles which can be defined either as an array or as individual
-    names variables. Include this section if you want to store these local
-    varaibles in arrays. Otherwise individual local variables will be used.
-*/
-#if 1
-#define ARRAYS
-#endif
-
-/*  8. FIXED OR DYNAMIC TABLES
-
-    When this section is included the tables used by the code are compiled
-    statically into the binary file.  Otherwise the subroutine aes_init()
-    must be called to compute them before the code is first used.
-*/
-#if !defined (TC_WINDOWS_BOOT) && !(defined( _MSC_VER ) && ( _MSC_VER <= 800 ))
-#define FIXED_TABLES
-#endif
-
-/*  9. TABLE ALIGNMENT
-
-    On some sytsems speed will be improved by aligning the AES large lookup
-    tables on particular boundaries. This define should be set to a power of
-    two giving the desired alignment. It can be left undefined if alignment
-    is not needed.  This option is specific to the Microsft VC++ compiler -
-    it seems to sometimes cause trouble for the VC++ version 6 compiler.
-*/
-
-#if 1 && defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
-#define TABLE_ALIGN 32
-#endif
-
-/*  10. TABLE OPTIONS
-
-    This cipher proceeds by repeating in a number of cycles known as 'rounds'
-    which are implemented by a round function which can optionally be speeded
-    up using tables.  The basic tables are each 256 32-bit words, with either
-    one or four tables being required for each round function depending on
-    how much speed is required. The encryption and decryption round functions
-    are different and the last encryption and decrytpion round functions are
-    different again making four different round functions in all.
-
-    This means that:
-      1. Normal encryption and decryption rounds can each use either 0, 1
-         or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
-      2. The last encryption and decryption rounds can also use either 0, 1
-         or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
-
-    Include or exclude the appropriate definitions below to set the number
-    of tables used by this implementation.
-*/
-
-#if 1   /* set tables for the normal encryption round */
-#define ENC_ROUND   FOUR_TABLES
-#elif 0
-#define ENC_ROUND   ONE_TABLE
-#else
-#define ENC_ROUND   NO_TABLES
-#endif
-
-#if 1   /* set tables for the last encryption round */
-#define LAST_ENC_ROUND  FOUR_TABLES
-#elif 0
-#define LAST_ENC_ROUND  ONE_TABLE
-#else
-#define LAST_ENC_ROUND  NO_TABLES
-#endif
-
-#if 1   /* set tables for the normal decryption round */
-#define DEC_ROUND   FOUR_TABLES
-#elif 0
-#define DEC_ROUND   ONE_TABLE
-#else
-#define DEC_ROUND   NO_TABLES
-#endif
-
-#if 1   /* set tables for the last decryption round */
-#define LAST_DEC_ROUND  FOUR_TABLES
-#elif 0
-#define LAST_DEC_ROUND  ONE_TABLE
-#else
-#define LAST_DEC_ROUND  NO_TABLES
-#endif
-
-/*  The decryption key schedule can be speeded up with tables in the same
-    way that the round functions can.  Include or exclude the following
-    defines to set this requirement.
-*/
-#if 1
-#define KEY_SCHED   FOUR_TABLES
-#elif 0
-#define KEY_SCHED   ONE_TABLE
-#else
-#define KEY_SCHED   NO_TABLES
-#endif
-
-/*  ---- END OF USER CONFIGURED OPTIONS ---- */
-
-/* VIA ACE support is only available for VC++ and GCC */
-
-#if !defined( _MSC_VER ) && !defined( __GNUC__ )
-#  if defined( ASSUME_VIA_ACE_PRESENT )
-#    undef ASSUME_VIA_ACE_PRESENT
-#  endif
-#  if defined( USE_VIA_ACE_IF_PRESENT )
-#    undef USE_VIA_ACE_IF_PRESENT
-#  endif
-#endif
-
-#if defined( ASSUME_VIA_ACE_PRESENT ) && !defined( USE_VIA_ACE_IF_PRESENT )
-#define USE_VIA_ACE_IF_PRESENT
-#endif
-
-#if defined( USE_VIA_ACE_IF_PRESENT ) && !defined ( AES_REV_DKS )
-#define AES_REV_DKS
-#endif
-
-/* Assembler support requires the use of platform byte order */
-
-#if ( defined( ASM_X86_V1C ) || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) ) \
-    && (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER)
-#undef  ALGORITHM_BYTE_ORDER
-#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
-#endif
-
-/* In this implementation the columns of the state array are each held in
-   32-bit words. The state array can be held in various ways: in an array
-   of words, in a number of individual word variables or in a number of
-   processor registers. The following define maps a variable name x and
-   a column number c to the way the state array variable is to be held.
-   The first define below maps the state into an array x[c] whereas the
-   second form maps the state into a number of individual variables x0,
-   x1, etc.  Another form could map individual state colums to machine
-   register names.
-*/
-
-#if defined( ARRAYS )
-#define s(x,c) x[c]
-#else
-#define s(x,c) x##c
-#endif
-
-/*  This implementation provides subroutines for encryption, decryption
-    and for setting the three key lengths (separately) for encryption
-    and decryption. Since not all functions are needed, masks are set
-    up here to determine which will be implemented in C
-*/
-
-#if !defined( AES_ENCRYPT )
-#  define EFUNCS_IN_C   0
-#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) \
-    || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C )
-#  define EFUNCS_IN_C   ENC_KEYING_IN_C
-#elif !defined( ASM_X86_V2 )
-#  define EFUNCS_IN_C   ( ENCRYPTION_IN_C | ENC_KEYING_IN_C )
-#else
-#  define EFUNCS_IN_C   0
-#endif
-
-#if !defined( AES_DECRYPT )
-#  define DFUNCS_IN_C   0
-#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) \
-    || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C )
-#  define DFUNCS_IN_C   DEC_KEYING_IN_C
-#elif !defined( ASM_X86_V2 )
-#  define DFUNCS_IN_C   ( DECRYPTION_IN_C | DEC_KEYING_IN_C )
-#else
-#  define DFUNCS_IN_C   0
-#endif
-
-#define FUNCS_IN_C  ( EFUNCS_IN_C | DFUNCS_IN_C )
-
-/* END OF CONFIGURATION OPTIONS */
-
-#define RC_LENGTH   (5 * (AES_BLOCK_SIZE / 4 - 2))
-
-/* Disable or report errors on some combinations of options */
-
-#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES
-#undef  LAST_ENC_ROUND
-#define LAST_ENC_ROUND  NO_TABLES
-#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES
-#undef  LAST_ENC_ROUND
-#define LAST_ENC_ROUND  ONE_TABLE
-#endif
-
-#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE
-#undef  ENC_UNROLL
-#define ENC_UNROLL  NONE
-#endif
-
-#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES
-#undef  LAST_DEC_ROUND
-#define LAST_DEC_ROUND  NO_TABLES
-#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES
-#undef  LAST_DEC_ROUND
-#define LAST_DEC_ROUND  ONE_TABLE
-#endif
-
-#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE
-#undef  DEC_UNROLL
-#define DEC_UNROLL  NONE
-#endif
-
-#if defined( bswap32 )
-#define aes_sw32    bswap32
-#elif defined( bswap_32 )
-#define aes_sw32    bswap_32
-#else
-#define brot(x,n)   (((uint_32t)(x) <<  n) | ((uint_32t)(x) >> (32 - n)))
-#define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00))
-#endif
-
-/*  upr(x,n):  rotates bytes within words by n positions, moving bytes to
-               higher index positions with wrap around into low positions
-    ups(x,n):  moves bytes by n positions to higher index positions in
-               words but without wrap around
-    bval(x,n): extracts a byte from a word
-
-    WARNING:   The definitions given here are intended only for use with
-               unsigned variables and with shift counts that are compile
-               time constants
-*/
-
-#if ( ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN )
-#define upr(x,n)        (((uint_32t)(x) << (8 * (n))) | ((uint_32t)(x) >> (32 - 8 * (n))))
-#define ups(x,n)        ((uint_32t) (x) << (8 * (n)))
-#define bval(x,n)       ((uint_8t)((x) >> (8 * (n))))
-#define bytes2word(b0, b1, b2, b3)  \
-        (((uint_32t)(b3) << 24) | ((uint_32t)(b2) << 16) | ((uint_32t)(b1) << 8) | (b0))
-#endif
-
-#if ( ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN )
-#define upr(x,n)        (((uint_32t)(x) >> (8 * (n))) | ((uint_32t)(x) << (32 - 8 * (n))))
-#define ups(x,n)        ((uint_32t) (x) >> (8 * (n)))
-#define bval(x,n)       ((uint_8t)((x) >> (24 - 8 * (n))))
-#define bytes2word(b0, b1, b2, b3)  \
-        (((uint_32t)(b0) << 24) | ((uint_32t)(b1) << 16) | ((uint_32t)(b2) << 8) | (b3))
-#endif
-
-#if defined( SAFE_IO )
-
-#define word_in(x,c)    bytes2word(((const uint_8t*)(x)+4*c)[0], ((const uint_8t*)(x)+4*c)[1], \
-                                   ((const uint_8t*)(x)+4*c)[2], ((const uint_8t*)(x)+4*c)[3])
-#define word_out(x,c,v) { ((uint_8t*)(x)+4*c)[0] = bval(v,0); ((uint_8t*)(x)+4*c)[1] = bval(v,1); \
-                          ((uint_8t*)(x)+4*c)[2] = bval(v,2); ((uint_8t*)(x)+4*c)[3] = bval(v,3); }
-
-#elif ( ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER )
-
-#define word_in(x,c)    (*((uint_32t*)(x)+(c)))
-#define word_out(x,c,v) (*((uint_32t*)(x)+(c)) = (v))
-
-#else
-
-#define word_in(x,c)    aes_sw32(*((uint_32t*)(x)+(c)))
-#define word_out(x,c,v) (*((uint_32t*)(x)+(c)) = aes_sw32(v))
-
-#endif
-
-/* the finite field modular polynomial and elements */
-
-#define WPOLY   0x011b
-#define BPOLY     0x1b
-
-/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */
-
-#define m1  0x80808080
-#define m2  0x7f7f7f7f
-#define gf_mulx(x)  ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))
-
-/* The following defines provide alternative definitions of gf_mulx that might
-   give improved performance if a fast 32-bit multiply is not available. Note
-   that a temporary variable u needs to be defined where gf_mulx is used.
-
-#define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6))
-#define m4  (0x01010101 * BPOLY)
-#define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4)
-*/
-
-/* Work out which tables are needed for the different options   */
-
-#if defined( ASM_X86_V1C )
-#if defined( ENC_ROUND )
-#undef  ENC_ROUND
-#endif
-#define ENC_ROUND   FOUR_TABLES
-#if defined( LAST_ENC_ROUND )
-#undef  LAST_ENC_ROUND
-#endif
-#define LAST_ENC_ROUND  FOUR_TABLES
-#if defined( DEC_ROUND )
-#undef  DEC_ROUND
-#endif
-#define DEC_ROUND   FOUR_TABLES
-#if defined( LAST_DEC_ROUND )
-#undef  LAST_DEC_ROUND
-#endif
-#define LAST_DEC_ROUND  FOUR_TABLES
-#if defined( KEY_SCHED )
-#undef  KEY_SCHED
-#define KEY_SCHED   FOUR_TABLES
-#endif
-#endif
-
-#if ( FUNCS_IN_C & ENCRYPTION_IN_C ) || defined( ASM_X86_V1C )
-#if ENC_ROUND == ONE_TABLE
-#define FT1_SET
-#elif ENC_ROUND == FOUR_TABLES
-#define FT4_SET
-#else
-#define SBX_SET
-#endif
-#if LAST_ENC_ROUND == ONE_TABLE
-#define FL1_SET
-#elif LAST_ENC_ROUND == FOUR_TABLES
-#define FL4_SET
-#elif !defined( SBX_SET )
-#define SBX_SET
-#endif
-#endif
-
-#if ( FUNCS_IN_C & DECRYPTION_IN_C ) || defined( ASM_X86_V1C )
-#if DEC_ROUND == ONE_TABLE
-#define IT1_SET
-#elif DEC_ROUND == FOUR_TABLES
-#define IT4_SET
-#else
-#define ISB_SET
-#endif
-#if LAST_DEC_ROUND == ONE_TABLE
-#define IL1_SET
-#elif LAST_DEC_ROUND == FOUR_TABLES
-#define IL4_SET
-#elif !defined(ISB_SET)
-#define ISB_SET
-#endif
-#endif
-
-#if (FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C)
-#if KEY_SCHED == ONE_TABLE
-#define LS1_SET
-#elif KEY_SCHED == FOUR_TABLES
-#define LS4_SET
-#elif !defined( SBX_SET )
-#define SBX_SET
-#endif
-#endif
-
-#if (FUNCS_IN_C & DEC_KEYING_IN_C)
-#if KEY_SCHED == ONE_TABLE
-#define IM1_SET
-#elif KEY_SCHED == FOUR_TABLES
-#define IM4_SET
-#elif !defined( SBX_SET )
-#define SBX_SET
-#endif
-#endif
-
-/* generic definitions of Rijndael macros that use tables    */
-
-#define no_table(x,box,vf,rf,c) bytes2word( \
-    box[bval(vf(x,0,c),rf(0,c))], \
-    box[bval(vf(x,1,c),rf(1,c))], \
-    box[bval(vf(x,2,c),rf(2,c))], \
-    box[bval(vf(x,3,c),rf(3,c))])
-
-#define one_table(x,op,tab,vf,rf,c) \
- (     tab[bval(vf(x,0,c),rf(0,c))] \
-  ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \
-  ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \
-  ^ op(tab[bval(vf(x,3,c),rf(3,c))],3))
-
-#define four_tables(x,tab,vf,rf,c) \
- (  tab[0][bval(vf(x,0,c),rf(0,c))] \
-  ^ tab[1][bval(vf(x,1,c),rf(1,c))] \
-  ^ tab[2][bval(vf(x,2,c),rf(2,c))] \
-  ^ tab[3][bval(vf(x,3,c),rf(3,c))])
-
-#define vf1(x,r,c)  (x)
-#define rf1(r,c)    (r)
-#define rf2(r,c)    ((8+r-c)&3)
-
-/* perform forward and inverse column mix operation on four bytes in long word x in */
-/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros.  */
-
-#if defined( FM4_SET )    /* not currently used */
-#define fwd_mcol(x)       four_tables(x,t_use(f,m),vf1,rf1,0)
-#elif defined( FM1_SET )  /* not currently used */
-#define fwd_mcol(x)       one_table(x,upr,t_use(f,m),vf1,rf1,0)
-#else
-#define dec_fmvars        uint_32t g2
-#define fwd_mcol(x)       (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1))
-#endif
-
-#if defined( IM4_SET )
-#define inv_mcol(x)       four_tables(x,t_use(i,m),vf1,rf1,0)
-#elif defined( IM1_SET )
-#define inv_mcol(x)       one_table(x,upr,t_use(i,m),vf1,rf1,0)
-#else
-#define dec_imvars        uint_32t g2, g4, g9
-#define inv_mcol(x)       (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \
-                          (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1))
-#endif
-
-#if defined( FL4_SET )
-#define ls_box(x,c)       four_tables(x,t_use(f,l),vf1,rf2,c)
-#elif   defined( LS4_SET )
-#define ls_box(x,c)       four_tables(x,t_use(l,s),vf1,rf2,c)
-#elif defined( FL1_SET )
-#define ls_box(x,c)       one_table(x,upr,t_use(f,l),vf1,rf2,c)
-#elif defined( LS1_SET )
-#define ls_box(x,c)       one_table(x,upr,t_use(l,s),vf1,rf2,c)
-#else
-#define ls_box(x,c)     no_table(x,t_use(s,box),vf1,rf2,c)
-#endif
-
-#if defined( ASM_X86_V1C ) && defined( AES_DECRYPT ) && !defined( ISB_SET )
-#define ISB_SET
-#endif
-
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+
+ This file contains the compilation options for AES (Rijndael) and code
+ that is common across encryption, key scheduling and table generation.
+
+ OPERATION
+
+ These source code files implement the AES algorithm Rijndael designed by
+ Joan Daemen and Vincent Rijmen. This version is designed for the standard
+ block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24
+ and 32 bytes).
+
+ This version is designed for flexibility and speed using operations on
+ 32-bit words rather than operations on bytes.  It can be compiled with
+ either big or little endian internal byte order but is faster when the
+ native byte order for the processor is used.
+
+ THE CIPHER INTERFACE
+
+ The cipher interface is implemented as an array of bytes in which lower
+ AES bit sequence indexes map to higher numeric significance within bytes.
+
+  uint_8t                 (an unsigned  8-bit type)
+  uint_32t                (an unsigned 32-bit type)
+  struct aes_encrypt_ctx  (structure for the cipher encryption context)
+  struct aes_decrypt_ctx  (structure for the cipher decryption context)
+  AES_RETURN                the function return type
+
+  C subroutine calls:
+
+  AES_RETURN aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]);
+  AES_RETURN aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]);
+  AES_RETURN aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]);
+  AES_RETURN aes_encrypt(const unsigned char *in, unsigned char *out,
+                                                  const aes_encrypt_ctx cx[1]);
+
+  AES_RETURN aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]);
+  AES_RETURN aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]);
+  AES_RETURN aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]);
+  AES_RETURN aes_decrypt(const unsigned char *in, unsigned char *out,
+                                                  const aes_decrypt_ctx cx[1]);
+
+ IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that
+ you call aes_init() before AES is used so that the tables are initialised.
+
+ C++ aes class subroutines:
+
+     Class AESencrypt  for encryption
+
+      Constructors:
+          AESencrypt(void)
+          AESencrypt(const unsigned char *key) - 128 bit key
+      Members:
+          AES_RETURN key128(const unsigned char *key)
+          AES_RETURN key192(const unsigned char *key)
+          AES_RETURN key256(const unsigned char *key)
+          AES_RETURN encrypt(const unsigned char *in, unsigned char *out) const
+
+      Class AESdecrypt  for decryption
+      Constructors:
+          AESdecrypt(void)
+          AESdecrypt(const unsigned char *key) - 128 bit key
+      Members:
+          AES_RETURN key128(const unsigned char *key)
+          AES_RETURN key192(const unsigned char *key)
+          AES_RETURN key256(const unsigned char *key)
+          AES_RETURN decrypt(const unsigned char *in, unsigned char *out) const
+*/
+
+/* Adapted for TrueCrypt */
+
+#if !defined( _AESOPT_H )
+#define _AESOPT_H
+
+#ifdef TC_WINDOWS_BOOT
+#define ASM_X86_V2
+#endif
+
+#if defined( __cplusplus )
+#include "Aescpp.h"
+#else
+#include "Aes.h"
+#endif
+
+
+#include "Common/Endian.h"
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if BYTE_ORDER == BIG_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#endif
+
+
+/*  CONFIGURATION - THE USE OF DEFINES
+
+    Later in this section there are a number of defines that control the
+    operation of the code.  In each section, the purpose of each define is
+    explained so that the relevant form can be included or excluded by
+    setting either 1's or 0's respectively on the branches of the related
+    #if clauses.  The following local defines should not be changed.
+*/
+
+#define ENCRYPTION_IN_C     1
+#define DECRYPTION_IN_C     2
+#define ENC_KEYING_IN_C     4
+#define DEC_KEYING_IN_C     8
+
+#define NO_TABLES           0
+#define ONE_TABLE           1
+#define FOUR_TABLES         4
+#define NONE                0
+#define PARTIAL             1
+#define FULL                2
+
+/*  --- START OF USER CONFIGURED OPTIONS --- */
+
+/*  1. BYTE ORDER WITHIN 32 BIT WORDS
+
+    The fundamental data processing units in Rijndael are 8-bit bytes. The
+    input, output and key input are all enumerated arrays of bytes in which
+    bytes are numbered starting at zero and increasing to one less than the
+    number of bytes in the array in question. This enumeration is only used
+    for naming bytes and does not imply any adjacency or order relationship
+    from one byte to another. When these inputs and outputs are considered
+    as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to
+    byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte.
+    In this implementation bits are numbered from 0 to 7 starting at the
+    numerically least significant end of each byte (bit n represents 2^n).
+
+    However, Rijndael can be implemented more efficiently using 32-bit
+    words by packing bytes into words so that bytes 4*n to 4*n+3 are placed
+    into word[n]. While in principle these bytes can be assembled into words
+    in any positions, this implementation only supports the two formats in
+    which bytes in adjacent positions within words also have adjacent byte
+    numbers. This order is called big-endian if the lowest numbered bytes
+    in words have the highest numeric significance and little-endian if the
+    opposite applies.
+
+    This code can work in either order irrespective of the order used by the
+    machine on which it runs. Normally the internal byte order will be set
+    to the order of the processor on which the code is to be run but this
+    define can be used to reverse this in special situations
+
+    WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set.
+    This define will hence be redefined later (in section 4) if necessary
+*/
+
+#if 1
+#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#elif 0
+#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0
+#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#error The algorithm byte order is not defined
+#endif
+
+/*  2. VIA ACE SUPPORT
+
+    Define this option if support for the VIA ACE is required. This uses
+    inline assembler instructions and is only implemented for the Microsoft,
+    Intel and GCC compilers.  If VIA ACE is known to be present, then defining
+    ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption
+    code.  If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if
+    it is detected (both present and enabled) but the normal AES code will
+    also be present.
+
+    When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte
+    aligned; other input/output buffers do not need to be 16 byte aligned
+    but there are very large performance gains if this can be arranged.
+    VIA ACE also requires the decryption key schedule to be in reverse
+    order (which later checks below ensure).
+*/
+
+#if 0 && !defined( USE_VIA_ACE_IF_PRESENT )
+#  define USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if 0 && !defined( ASSUME_VIA_ACE_PRESENT )
+#  define ASSUME_VIA_ACE_PRESENT
+#  endif
+
+#if defined ( _WIN64 ) || defined( _WIN32_WCE ) || \
+                    defined( _MSC_VER ) && ( _MSC_VER <= 800 )
+#  if defined( USE_VIA_ACE_IF_PRESENT )
+#    undef USE_VIA_ACE_IF_PRESENT
+#  endif
+#  if defined( ASSUME_VIA_ACE_PRESENT )
+#    undef ASSUME_VIA_ACE_PRESENT
+#  endif
+#endif
+
+/*  3. ASSEMBLER SUPPORT
+
+    This define (which can be on the command line) enables the use of the
+    assembler code routines for encryption, decryption and key scheduling
+    as follows:
+
+    ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for
+                encryption and decryption but with key scheduling in C
+    ASM_X86_V2  uses assembler (aes_x86_v2.asm) with compressed tables for
+                encryption, decryption and key scheduling
+    ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for
+                encryption and decryption but with key scheduling in C
+    ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for
+                encryption and decryption but with key scheduling in C
+
+    Change one 'if 0' below to 'if 1' to select the version or define
+    as a compilation option.
+*/
+
+#if 0 && !defined( ASM_X86_V1C )
+#  define ASM_X86_V1C
+#elif 0 && !defined( ASM_X86_V2  )
+#  define ASM_X86_V2
+#elif 0 && !defined( ASM_X86_V2C )
+#  define ASM_X86_V2C
+#elif 0 && !defined( ASM_AMD64_C )
+#  define ASM_AMD64_C
+#endif
+
+#if (defined ( ASM_X86_V1C ) || defined( ASM_X86_V2 ) || defined( ASM_X86_V2C )) \
+      && !defined( _M_IX86 ) || defined( ASM_AMD64_C ) && !defined( _M_X64 )
+//#  error Assembler code is only available for x86 and AMD64 systems
+#endif
+
+/*  4. FAST INPUT/OUTPUT OPERATIONS.
+
+    On some machines it is possible to improve speed by transferring the
+    bytes in the input and output arrays to and from the internal 32-bit
+    variables by addressing these arrays as if they are arrays of 32-bit
+    words.  On some machines this will always be possible but there may
+    be a large performance penalty if the byte arrays are not aligned on
+    the normal word boundaries. On other machines this technique will
+    lead to memory access errors when such 32-bit word accesses are not
+    properly aligned. The option SAFE_IO avoids such problems but will
+    often be slower on those machines that support misaligned access
+    (especially so if care is taken to align the input  and output byte
+    arrays on 32-bit word boundaries). If SAFE_IO is not defined it is
+    assumed that access to byte arrays as if they are arrays of 32-bit
+    words will not cause problems when such accesses are misaligned.
+*/
+#if 1 && !defined( _MSC_VER )
+#define SAFE_IO
+#endif
+
+/*  5. LOOP UNROLLING
+
+    The code for encryption and decryption cycles through a number of rounds
+    that can be implemented either in a loop or by expanding the code into a
+    long sequence of instructions, the latter producing a larger program but
+    one that will often be much faster. The latter is called loop unrolling.
+    There are also potential speed advantages in expanding two iterations in
+    a loop with half the number of iterations, which is called partial loop
+    unrolling.  The following options allow partial or full loop unrolling
+    to be set independently for encryption and decryption
+*/
+#if 1
+#define ENC_UNROLL  FULL
+#elif 0
+#define ENC_UNROLL  PARTIAL
+#else
+#define ENC_UNROLL  NONE
+#endif
+
+#if 1
+#define DEC_UNROLL  FULL
+#elif 0
+#define DEC_UNROLL  PARTIAL
+#else
+#define DEC_UNROLL  NONE
+#endif
+
+/*  6. FAST FINITE FIELD OPERATIONS
+
+    If this section is included, tables are used to provide faster finite
+    field arithmetic (this has no effect if FIXED_TABLES is defined).
+*/
+#if !defined (TC_WINDOWS_BOOT)
+#define FF_TABLES
+#endif
+
+/*  7. INTERNAL STATE VARIABLE FORMAT
+
+    The internal state of Rijndael is stored in a number of local 32-bit
+    word variables which can be defined either as an array or as individual
+    named variables. Include this section if you want to store these local
+    variables in arrays. Otherwise individual local variables will be used.
+*/
+#if 1
+#define ARRAYS
+#endif
+
+/*  8. FIXED OR DYNAMIC TABLES
+
+    When this section is included the tables used by the code are compiled
+    statically into the binary file.  Otherwise the subroutine aes_init()
+    must be called to compute them before the code is first used.
+*/
+#if !defined (TC_WINDOWS_BOOT) && !(defined( _MSC_VER ) && ( _MSC_VER <= 800 ))
+#define FIXED_TABLES
+#endif
+
+/*  9. TABLE ALIGNMENT
+
+    On some systems speed will be improved by aligning the AES large lookup
+    tables on particular boundaries. This define should be set to a power of
+    two giving the desired alignment. It can be left undefined if alignment
+    is not needed.  This option is specific to the Microsoft VC++ compiler -
+    it seems to sometimes cause trouble for the VC++ version 6 compiler.
+*/
+
+#if 1 && defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
+#define TABLE_ALIGN 32
+#endif
+
+/*  10. TABLE OPTIONS
+
+    This cipher proceeds by repeating in a number of cycles known as 'rounds'
+    which are implemented by a round function which can optionally be speeded
+    up using tables.  The basic tables are each 256 32-bit words, with either
+    one or four tables being required for each round function depending on
+    how much speed is required. The encryption and decryption round functions
+    are different and the last encryption and decryption round functions are
+    different again making four different round functions in all.
+
+    This means that:
+      1. Normal encryption and decryption rounds can each use either 0, 1
+         or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+      2. The last encryption and decryption rounds can also use either 0, 1
+         or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+
+    Include or exclude the appropriate definitions below to set the number
+    of tables used by this implementation.
+*/
+
+#if 1   /* set tables for the normal encryption round */
+#define ENC_ROUND   FOUR_TABLES
+#elif 0
+#define ENC_ROUND   ONE_TABLE
+#else
+#define ENC_ROUND   NO_TABLES
+#endif
+
+#if 1   /* set tables for the last encryption round */
+#define LAST_ENC_ROUND  FOUR_TABLES
+#elif 0
+#define LAST_ENC_ROUND  ONE_TABLE
+#else
+#define LAST_ENC_ROUND  NO_TABLES
+#endif
+
+#if 1   /* set tables for the normal decryption round */
+#define DEC_ROUND   FOUR_TABLES
+#elif 0
+#define DEC_ROUND   ONE_TABLE
+#else
+#define DEC_ROUND   NO_TABLES
+#endif
+
+#if 1   /* set tables for the last decryption round */
+#define LAST_DEC_ROUND  FOUR_TABLES
+#elif 0
+#define LAST_DEC_ROUND  ONE_TABLE
+#else
+#define LAST_DEC_ROUND  NO_TABLES
+#endif
+
+/*  The decryption key schedule can be speeded up with tables in the same
+    way that the round functions can.  Include or exclude the following
+    defines to set this requirement.
+*/
+#if 1
+#define KEY_SCHED   FOUR_TABLES
+#elif 0
+#define KEY_SCHED   ONE_TABLE
+#else
+#define KEY_SCHED   NO_TABLES
+#endif
+
+/*  ---- END OF USER CONFIGURED OPTIONS ---- */
+
+/* VIA ACE support is only available for VC++ and GCC */
+
+#if !defined( _MSC_VER ) && !defined( __GNUC__ )
+#  if defined( ASSUME_VIA_ACE_PRESENT )
+#    undef ASSUME_VIA_ACE_PRESENT
+#  endif
+#  if defined( USE_VIA_ACE_IF_PRESENT )
+#    undef USE_VIA_ACE_IF_PRESENT
+#  endif
+#endif
+
+#if defined( ASSUME_VIA_ACE_PRESENT ) && !defined( USE_VIA_ACE_IF_PRESENT )
+#define USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if defined( USE_VIA_ACE_IF_PRESENT ) && !defined ( AES_REV_DKS )
+#define AES_REV_DKS
+#endif
+
+/* Assembler support requires the use of platform byte order */
+
+#if ( defined( ASM_X86_V1C ) || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) ) \
+    && (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER)
+#undef  ALGORITHM_BYTE_ORDER
+#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#endif
+
+/* In this implementation the columns of the state array are each held in
+   32-bit words. The state array can be held in various ways: in an array
+   of words, in a number of individual word variables or in a number of
+   processor registers. The following define maps a variable name x and
+   a column number c to the way the state array variable is to be held.
+   The first define below maps the state into an array x[c] whereas the
+   second form maps the state into a number of individual variables x0,
+   x1, etc.  Another form could map individual state columns to machine
+   register names.
+*/
+
+#if defined( ARRAYS )
+#define s(x,c) x[c]
+#else
+#define s(x,c) x##c
+#endif
+
+/*  This implementation provides subroutines for encryption, decryption
+    and for setting the three key lengths (separately) for encryption
+    and decryption. Since not all functions are needed, masks are set
+    up here to determine which will be implemented in C
+*/
+
+#if !defined( AES_ENCRYPT )
+#  define EFUNCS_IN_C   0
+#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) \
+    || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C )
+#  define EFUNCS_IN_C   ENC_KEYING_IN_C
+#elif !defined( ASM_X86_V2 )
+#  define EFUNCS_IN_C   ( ENCRYPTION_IN_C | ENC_KEYING_IN_C )
+#else
+#  define EFUNCS_IN_C   0
+#endif
+
+#if !defined( AES_DECRYPT )
+#  define DFUNCS_IN_C   0
+#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) \
+    || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C )
+#  define DFUNCS_IN_C   DEC_KEYING_IN_C
+#elif !defined( ASM_X86_V2 )
+#  define DFUNCS_IN_C   ( DECRYPTION_IN_C | DEC_KEYING_IN_C )
+#else
+#  define DFUNCS_IN_C   0
+#endif
+
+#define FUNCS_IN_C  ( EFUNCS_IN_C | DFUNCS_IN_C )
+
+/* END OF CONFIGURATION OPTIONS */
+
+#define RC_LENGTH   (5 * (AES_BLOCK_SIZE / 4 - 2))
+
+/* Disable or report errors on some combinations of options */
+
+#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES
+#undef  LAST_ENC_ROUND
+#define LAST_ENC_ROUND  NO_TABLES
+#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES
+#undef  LAST_ENC_ROUND
+#define LAST_ENC_ROUND  ONE_TABLE
+#endif
+
+#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE
+#undef  ENC_UNROLL
+#define ENC_UNROLL  NONE
+#endif
+
+#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES
+#undef  LAST_DEC_ROUND
+#define LAST_DEC_ROUND  NO_TABLES
+#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES
+#undef  LAST_DEC_ROUND
+#define LAST_DEC_ROUND  ONE_TABLE
+#endif
+
+#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE
+#undef  DEC_UNROLL
+#define DEC_UNROLL  NONE
+#endif
+
+#if defined( bswap32 )
+#define aes_sw32    bswap32
+#elif defined( bswap_32 )
+#define aes_sw32    bswap_32
+#else
+#define brot(x,n)   (((uint_32t)(x) <<  n) | ((uint_32t)(x) >> (32 - n)))
+#define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00))
+#endif
+
+/*  upr(x,n):  rotates bytes within words by n positions, moving bytes to
+               higher index positions with wrap around into low positions
+    ups(x,n):  moves bytes by n positions to higher index positions in
+               words but without wrap around
+    bval(x,n): extracts a byte from a word
+
+    WARNING:   The definitions given here are intended only for use with
+               unsigned variables and with shift counts that are compile
+               time constants
+*/
+
+#if ( ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN )
+#define upr(x,n)        (((uint_32t)(x) << (8 * (n))) | ((uint_32t)(x) >> (32 - 8 * (n))))
+#define ups(x,n)        ((uint_32t) (x) << (8 * (n)))
+#define bval(x,n)       ((uint_8t)((x) >> (8 * (n))))
+#define bytes2word(b0, b1, b2, b3)  \
+        (((uint_32t)(b3) << 24) | ((uint_32t)(b2) << 16) | ((uint_32t)(b1) << 8) | (b0))
+#endif
+
+#if ( ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN )
+#define upr(x,n)        (((uint_32t)(x) >> (8 * (n))) | ((uint_32t)(x) << (32 - 8 * (n))))
+#define ups(x,n)        ((uint_32t) (x) >> (8 * (n)))
+#define bval(x,n)       ((uint_8t)((x) >> (24 - 8 * (n))))
+#define bytes2word(b0, b1, b2, b3)  \
+        (((uint_32t)(b0) << 24) | ((uint_32t)(b1) << 16) | ((uint_32t)(b2) << 8) | (b3))
+#endif
+
+#if defined( SAFE_IO )
+
+#define word_in(x,c)    bytes2word(((const uint_8t*)(x)+4*c)[0], ((const uint_8t*)(x)+4*c)[1], \
+                                   ((const uint_8t*)(x)+4*c)[2], ((const uint_8t*)(x)+4*c)[3])
+#define word_out(x,c,v) { ((uint_8t*)(x)+4*c)[0] = bval(v,0); ((uint_8t*)(x)+4*c)[1] = bval(v,1); \
+                          ((uint_8t*)(x)+4*c)[2] = bval(v,2); ((uint_8t*)(x)+4*c)[3] = bval(v,3); }
+
+#elif ( ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER )
+
+#define word_in(x,c)    (*((uint_32t*)(x)+(c)))
+#define word_out(x,c,v) (*((uint_32t*)(x)+(c)) = (v))
+
+#else
+
+#define word_in(x,c)    aes_sw32(*((uint_32t*)(x)+(c)))
+#define word_out(x,c,v) (*((uint_32t*)(x)+(c)) = aes_sw32(v))
+
+#endif
+
+/* the finite field modular polynomial and elements */
+
+#define WPOLY   0x011b
+#define BPOLY     0x1b
+
+/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */
+
+#define m1  0x80808080
+#define m2  0x7f7f7f7f
+#define gf_mulx(x)  ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))
+
+/* The following defines provide alternative definitions of gf_mulx that might
+   give improved performance if a fast 32-bit multiply is not available. Note
+   that a temporary variable u needs to be defined where gf_mulx is used.
+
+#define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6))
+#define m4  (0x01010101 * BPOLY)
+#define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4)
+*/
+
+/* Work out which tables are needed for the different options   */
+
+#if defined( ASM_X86_V1C )
+#if defined( ENC_ROUND )
+#undef  ENC_ROUND
+#endif
+#define ENC_ROUND   FOUR_TABLES
+#if defined( LAST_ENC_ROUND )
+#undef  LAST_ENC_ROUND
+#endif
+#define LAST_ENC_ROUND  FOUR_TABLES
+#if defined( DEC_ROUND )
+#undef  DEC_ROUND
+#endif
+#define DEC_ROUND   FOUR_TABLES
+#if defined( LAST_DEC_ROUND )
+#undef  LAST_DEC_ROUND
+#endif
+#define LAST_DEC_ROUND  FOUR_TABLES
+#if defined( KEY_SCHED )
+#undef  KEY_SCHED
+#define KEY_SCHED   FOUR_TABLES
+#endif
+#endif
+
+#if ( FUNCS_IN_C & ENCRYPTION_IN_C ) || defined( ASM_X86_V1C )
+#if ENC_ROUND == ONE_TABLE
+#define FT1_SET
+#elif ENC_ROUND == FOUR_TABLES
+#define FT4_SET
+#else
+#define SBX_SET
+#endif
+#if LAST_ENC_ROUND == ONE_TABLE
+#define FL1_SET
+#elif LAST_ENC_ROUND == FOUR_TABLES
+#define FL4_SET
+#elif !defined( SBX_SET )
+#define SBX_SET
+#endif
+#endif
+
+#if ( FUNCS_IN_C & DECRYPTION_IN_C ) || defined( ASM_X86_V1C )
+#if DEC_ROUND == ONE_TABLE
+#define IT1_SET
+#elif DEC_ROUND == FOUR_TABLES
+#define IT4_SET
+#else
+#define ISB_SET
+#endif
+#if LAST_DEC_ROUND == ONE_TABLE
+#define IL1_SET
+#elif LAST_DEC_ROUND == FOUR_TABLES
+#define IL4_SET
+#elif !defined(ISB_SET)
+#define ISB_SET
+#endif
+#endif
+
+#if (FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C)
+#if KEY_SCHED == ONE_TABLE
+#define LS1_SET
+#elif KEY_SCHED == FOUR_TABLES
+#define LS4_SET
+#elif !defined( SBX_SET )
+#define SBX_SET
+#endif
+#endif
+
+#if (FUNCS_IN_C & DEC_KEYING_IN_C)
+#if KEY_SCHED == ONE_TABLE
+#define IM1_SET
+#elif KEY_SCHED == FOUR_TABLES
+#define IM4_SET
+#elif !defined( SBX_SET )
+#define SBX_SET
+#endif
+#endif
+
+/* generic definitions of Rijndael macros that use tables    */
+
+#define no_table(x,box,vf,rf,c) bytes2word( \
+    box[bval(vf(x,0,c),rf(0,c))], \
+    box[bval(vf(x,1,c),rf(1,c))], \
+    box[bval(vf(x,2,c),rf(2,c))], \
+    box[bval(vf(x,3,c),rf(3,c))])
+
+#define one_table(x,op,tab,vf,rf,c) \
+ (     tab[bval(vf(x,0,c),rf(0,c))] \
+  ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \
+  ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \
+  ^ op(tab[bval(vf(x,3,c),rf(3,c))],3))
+
+#define four_tables(x,tab,vf,rf,c) \
+ (  tab[0][bval(vf(x,0,c),rf(0,c))] \
+  ^ tab[1][bval(vf(x,1,c),rf(1,c))] \
+  ^ tab[2][bval(vf(x,2,c),rf(2,c))] \
+  ^ tab[3][bval(vf(x,3,c),rf(3,c))])
+
+#define vf1(x,r,c)  (x)
+#define rf1(r,c)    (r)
+#define rf2(r,c)    ((8+r-c)&3)
+
+/* perform forward and inverse column mix operation on four bytes in long word x in */
+/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros.  */
+
+#if defined( FM4_SET )    /* not currently used */
+#define fwd_mcol(x)       four_tables(x,t_use(f,m),vf1,rf1,0)
+#elif defined( FM1_SET )  /* not currently used */
+#define fwd_mcol(x)       one_table(x,upr,t_use(f,m),vf1,rf1,0)
+#else
+#define dec_fmvars        uint_32t g2
+#define fwd_mcol(x)       (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1))
+#endif
+
+#if defined( IM4_SET )
+#define inv_mcol(x)       four_tables(x,t_use(i,m),vf1,rf1,0)
+#elif defined( IM1_SET )
+#define inv_mcol(x)       one_table(x,upr,t_use(i,m),vf1,rf1,0)
+#else
+#define dec_imvars        uint_32t g2, g4, g9
+#define inv_mcol(x)       (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \
+                          (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1))
+#endif
+
+#if defined( FL4_SET )
+#define ls_box(x,c)       four_tables(x,t_use(f,l),vf1,rf2,c)
+#elif   defined( LS4_SET )
+#define ls_box(x,c)       four_tables(x,t_use(l,s),vf1,rf2,c)
+#elif defined( FL1_SET )
+#define ls_box(x,c)       one_table(x,upr,t_use(f,l),vf1,rf2,c)
+#elif defined( LS1_SET )
+#define ls_box(x,c)       one_table(x,upr,t_use(l,s),vf1,rf2,c)
+#else
+#define ls_box(x,c)     no_table(x,t_use(s,box),vf1,rf2,c)
+#endif
+
+#if defined( ASM_X86_V1C ) && defined( AES_DECRYPT ) && !defined( ISB_SET )
+#define ISB_SET
+#endif
+
+#endif
diff --git a/src/Crypto/Aestab.c b/src/Crypto/Aestab.c
index 2fd53789..1effb6f6 100644
--- a/src/Crypto/Aestab.c
+++ b/src/Crypto/Aestab.c
@@ -1,428 +1,428 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 20/12/2007
-*/
-
-/* Adapted for TrueCrypt:
-   - Added run-time table generator for Aes_x86_v2.asm
-*/
-
-#define DO_TABLES
-
-#include "Aes.h"
-#include "Aesopt.h"
-
-#if defined(FIXED_TABLES)
-
-#define sb_data(w) {\
-    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
-    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
-    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
-    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
-    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
-    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
-    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
-    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
-    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
-    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
-    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
-    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
-    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
-    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
-    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
-    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
-    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
-    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
-    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
-    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
-    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
-    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
-    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
-    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
-    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
-    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
-    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
-    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
-    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
-    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
-    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
-    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
-
-#define isb_data(w) {\
-    w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), w(0x38),\
-    w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), w(0xd7), w(0xfb),\
-    w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), w(0x2f), w(0xff), w(0x87),\
-    w(0x34), w(0x8e), w(0x43), w(0x44), w(0xc4), w(0xde), w(0xe9), w(0xcb),\
-    w(0x54), w(0x7b), w(0x94), w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d),\
-    w(0xee), w(0x4c), w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e),\
-    w(0x08), w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2),\
-    w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), w(0x25),\
-    w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), w(0x98), w(0x16),\
-    w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), w(0x65), w(0xb6), w(0x92),\
-    w(0x6c), w(0x70), w(0x48), w(0x50), w(0xfd), w(0xed), w(0xb9), w(0xda),\
-    w(0x5e), w(0x15), w(0x46), w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84),\
-    w(0x90), w(0xd8), w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a),\
-    w(0xf7), w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06),\
-    w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), w(0x02),\
-    w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), w(0x8a), w(0x6b),\
-    w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), w(0x67), w(0xdc), w(0xea),\
-    w(0x97), w(0xf2), w(0xcf), w(0xce), w(0xf0), w(0xb4), w(0xe6), w(0x73),\
-    w(0x96), w(0xac), w(0x74), w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85),\
-    w(0xe2), w(0xf9), w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e),\
-    w(0x47), w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89),\
-    w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), w(0x1b),\
-    w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), w(0x79), w(0x20),\
-    w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), w(0xcd), w(0x5a), w(0xf4),\
-    w(0x1f), w(0xdd), w(0xa8), w(0x33), w(0x88), w(0x07), w(0xc7), w(0x31),\
-    w(0xb1), w(0x12), w(0x10), w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f),\
-    w(0x60), w(0x51), w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d),\
-    w(0x2d), w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef),\
-    w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), w(0xb0),\
-    w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), w(0x99), w(0x61),\
-    w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), w(0x77), w(0xd6), w(0x26),\
-    w(0xe1), w(0x69), w(0x14), w(0x63), w(0x55), w(0x21), w(0x0c), w(0x7d) }
-
-#define mm_data(w) {\
-    w(0x00), w(0x01), w(0x02), w(0x03), w(0x04), w(0x05), w(0x06), w(0x07),\
-    w(0x08), w(0x09), w(0x0a), w(0x0b), w(0x0c), w(0x0d), w(0x0e), w(0x0f),\
-    w(0x10), w(0x11), w(0x12), w(0x13), w(0x14), w(0x15), w(0x16), w(0x17),\
-    w(0x18), w(0x19), w(0x1a), w(0x1b), w(0x1c), w(0x1d), w(0x1e), w(0x1f),\
-    w(0x20), w(0x21), w(0x22), w(0x23), w(0x24), w(0x25), w(0x26), w(0x27),\
-    w(0x28), w(0x29), w(0x2a), w(0x2b), w(0x2c), w(0x2d), w(0x2e), w(0x2f),\
-    w(0x30), w(0x31), w(0x32), w(0x33), w(0x34), w(0x35), w(0x36), w(0x37),\
-    w(0x38), w(0x39), w(0x3a), w(0x3b), w(0x3c), w(0x3d), w(0x3e), w(0x3f),\
-    w(0x40), w(0x41), w(0x42), w(0x43), w(0x44), w(0x45), w(0x46), w(0x47),\
-    w(0x48), w(0x49), w(0x4a), w(0x4b), w(0x4c), w(0x4d), w(0x4e), w(0x4f),\
-    w(0x50), w(0x51), w(0x52), w(0x53), w(0x54), w(0x55), w(0x56), w(0x57),\
-    w(0x58), w(0x59), w(0x5a), w(0x5b), w(0x5c), w(0x5d), w(0x5e), w(0x5f),\
-    w(0x60), w(0x61), w(0x62), w(0x63), w(0x64), w(0x65), w(0x66), w(0x67),\
-    w(0x68), w(0x69), w(0x6a), w(0x6b), w(0x6c), w(0x6d), w(0x6e), w(0x6f),\
-    w(0x70), w(0x71), w(0x72), w(0x73), w(0x74), w(0x75), w(0x76), w(0x77),\
-    w(0x78), w(0x79), w(0x7a), w(0x7b), w(0x7c), w(0x7d), w(0x7e), w(0x7f),\
-    w(0x80), w(0x81), w(0x82), w(0x83), w(0x84), w(0x85), w(0x86), w(0x87),\
-    w(0x88), w(0x89), w(0x8a), w(0x8b), w(0x8c), w(0x8d), w(0x8e), w(0x8f),\
-    w(0x90), w(0x91), w(0x92), w(0x93), w(0x94), w(0x95), w(0x96), w(0x97),\
-    w(0x98), w(0x99), w(0x9a), w(0x9b), w(0x9c), w(0x9d), w(0x9e), w(0x9f),\
-    w(0xa0), w(0xa1), w(0xa2), w(0xa3), w(0xa4), w(0xa5), w(0xa6), w(0xa7),\
-    w(0xa8), w(0xa9), w(0xaa), w(0xab), w(0xac), w(0xad), w(0xae), w(0xaf),\
-    w(0xb0), w(0xb1), w(0xb2), w(0xb3), w(0xb4), w(0xb5), w(0xb6), w(0xb7),\
-    w(0xb8), w(0xb9), w(0xba), w(0xbb), w(0xbc), w(0xbd), w(0xbe), w(0xbf),\
-    w(0xc0), w(0xc1), w(0xc2), w(0xc3), w(0xc4), w(0xc5), w(0xc6), w(0xc7),\
-    w(0xc8), w(0xc9), w(0xca), w(0xcb), w(0xcc), w(0xcd), w(0xce), w(0xcf),\
-    w(0xd0), w(0xd1), w(0xd2), w(0xd3), w(0xd4), w(0xd5), w(0xd6), w(0xd7),\
-    w(0xd8), w(0xd9), w(0xda), w(0xdb), w(0xdc), w(0xdd), w(0xde), w(0xdf),\
-    w(0xe0), w(0xe1), w(0xe2), w(0xe3), w(0xe4), w(0xe5), w(0xe6), w(0xe7),\
-    w(0xe8), w(0xe9), w(0xea), w(0xeb), w(0xec), w(0xed), w(0xee), w(0xef),\
-    w(0xf0), w(0xf1), w(0xf2), w(0xf3), w(0xf4), w(0xf5), w(0xf6), w(0xf7),\
-    w(0xf8), w(0xf9), w(0xfa), w(0xfb), w(0xfc), w(0xfd), w(0xfe), w(0xff) }
-
-#define rc_data(w) {\
-    w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\
-    w(0x1b), w(0x36) }
-
-#define h0(x)   (x)
-
-#define w0(p)   bytes2word(p, 0, 0, 0)
-#define w1(p)   bytes2word(0, p, 0, 0)
-#define w2(p)   bytes2word(0, 0, p, 0)
-#define w3(p)   bytes2word(0, 0, 0, p)
-
-#define u0(p)   bytes2word(f2(p), p, p, f3(p))
-#define u1(p)   bytes2word(f3(p), f2(p), p, p)
-#define u2(p)   bytes2word(p, f3(p), f2(p), p)
-#define u3(p)   bytes2word(p, p, f3(p), f2(p))
-
-#define v0(p)   bytes2word(fe(p), f9(p), fd(p), fb(p))
-#define v1(p)   bytes2word(fb(p), fe(p), f9(p), fd(p))
-#define v2(p)   bytes2word(fd(p), fb(p), fe(p), f9(p))
-#define v3(p)   bytes2word(f9(p), fd(p), fb(p), fe(p))
-
-#endif
-
-#if defined(FIXED_TABLES) || !defined(FF_TABLES)
-
-#define f2(x)   ((x<<1) ^ (((x>>7) & 1) * WPOLY))
-#define f4(x)   ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
-#define f8(x)   ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \
-                        ^ (((x>>5) & 4) * WPOLY))
-#define f3(x)   (f2(x) ^ x)
-#define f9(x)   (f8(x) ^ x)
-#define fb(x)   (f8(x) ^ f2(x) ^ x)
-#define fd(x)   (f8(x) ^ f4(x) ^ x)
-#define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
-
-#else
-
-#define f2(x) ((x) ? pow[log[x] + 0x19] : 0)
-#define f3(x) ((x) ? pow[log[x] + 0x01] : 0)
-#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0)
-#define fb(x) ((x) ? pow[log[x] + 0x68] : 0)
-#define fd(x) ((x) ? pow[log[x] + 0xee] : 0)
-#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0)
-#define fi(x) ((x) ? pow[ 255 - log[x]] : 0)
-
-#endif
-
-#include "Aestab.h"
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-#if defined(FIXED_TABLES)
-
-/* implemented in case of wrong call for fixed tables */
-
-AES_RETURN aes_init(void)
-{
-    return EXIT_SUCCESS;
-}
-
-#else   /* dynamic table generation */
-
-#if !defined(FF_TABLES)
-
-/*  Generate the tables for the dynamic table option
-
-    It will generally be sensible to use tables to compute finite
-    field multiplies and inverses but where memory is scarse this
-    code might sometimes be better. But it only has effect during
-    initialisation so its pretty unimportant in overall terms.
-*/
-
-/*  return 2 ^ (n - 1) where n is the bit number of the highest bit
-    set in x with x in the range 1 < x < 0x00000200.   This form is
-    used so that locals within fi can be bytes rather than words
-*/
-
-static uint_8t hibit(const uint_32t x)
-{   uint_8t r = (uint_8t)((x >> 1) | (x >> 2));
-
-    r |= (r >> 2);
-    r |= (r >> 4);
-    return (r + 1) >> 1;
-}
-
-/* return the inverse of the finite field element x */
-
-static uint_8t fi(const uint_8t x)
-{   uint_8t p1 = x, p2 = BPOLY, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0;
-
-    if(x < 2) return x;
-
-    for(;;)
-    {
-        if(!n1) return v1;
-
-        while(n2 >= n1)
-        {
-            n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2);
-        }
-
-        if(!n2) return v2;
-
-        while(n1 >= n2)
-        {
-            n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1);
-        }
-    }
-}
-
-#endif
-
-/* The forward and inverse affine transformations used in the S-box */
-
-#define fwd_affine(x) \
-    (w = (uint_32t)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(uint_8t)(w^(w>>8)))
-
-#define inv_affine(x) \
-    (w = (uint_32t)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(uint_8t)(w^(w>>8)))
-
-static int init = 0;
-
-#ifdef TC_WINDOWS_BOOT
-
-#pragma optimize ("l", on)
-uint_8t aes_enc_tab[256][8];
-uint_8t aes_dec_tab[256][8];
-
-#endif
-
-AES_RETURN aes_init(void)
-{   uint_32t  i, w;
-
-#ifdef TC_WINDOWS_BOOT
-
-	if (init)
-		return EXIT_SUCCESS;
-
-    for (i = 0; i < 256; ++i)
-    { 
-        uint_8t x = fwd_affine(fi((uint_8t)i));
-		aes_enc_tab[i][0] = 0;
-		aes_enc_tab[i][1] = x;
-		aes_enc_tab[i][2] = x;
-		aes_enc_tab[i][3] = f3(x);
-		aes_enc_tab[i][4] = f2(x);
-		aes_enc_tab[i][5] = x;
-		aes_enc_tab[i][6] = x;
-		aes_enc_tab[i][7] = f3(x);
-
-        x = fi((uint_8t)inv_affine((uint_8t)i));
-		aes_dec_tab[i][0] = fe(x);
-		aes_dec_tab[i][1] = f9(x);
-		aes_dec_tab[i][2] = fd(x);
-		aes_dec_tab[i][3] = fb(x);
-		aes_dec_tab[i][4] = fe(x);
-		aes_dec_tab[i][5] = f9(x);
-		aes_dec_tab[i][6] = fd(x);
-		aes_dec_tab[i][7] = x;
-    }
-
-#else // TC_WINDOWS_BOOT
-
-#if defined(FF_TABLES)
-
-    uint_8t  pow[512], log[256];
-
-    if(init)
-        return EXIT_SUCCESS;
-    /*  log and power tables for GF(2^8) finite field with
-        WPOLY as modular polynomial - the simplest primitive
-        root is 0x03, used here to generate the tables
-    */
-
-    i = 0; w = 1;
-    do
-    {
-        pow[i] = (uint_8t)w;
-        pow[i + 255] = (uint_8t)w;
-        log[w] = (uint_8t)i++;
-        w ^=  (w << 1) ^ (w & 0x80 ? WPOLY : 0);
-    }
-    while (w != 1);
-
-#else
-    if(init)
-        return EXIT_SUCCESS;
-#endif
-
-    for(i = 0, w = 1; i < RC_LENGTH; ++i)
-    {
-        t_set(r,c)[i] = bytes2word(w, 0, 0, 0);
-        w = f2(w);
-    }
-
-    for(i = 0; i < 256; ++i)
-    {   uint_8t    b;
-
-        b = fwd_affine(fi((uint_8t)i));
-        w = bytes2word(f2(b), b, b, f3(b));
-
-#if defined( SBX_SET )
-        t_set(s,box)[i] = b;
-#endif
-
-#if defined( FT1_SET )                 /* tables for a normal encryption round */
-        t_set(f,n)[i] = w;
-#endif
-#if defined( FT4_SET )
-        t_set(f,n)[0][i] = w;
-        t_set(f,n)[1][i] = upr(w,1);
-        t_set(f,n)[2][i] = upr(w,2);
-        t_set(f,n)[3][i] = upr(w,3);
-#endif
-        w = bytes2word(b, 0, 0, 0);
-
-#if defined( FL1_SET )            /* tables for last encryption round (may also   */
-        t_set(f,l)[i] = w;        /* be used in the key schedule)                 */
-#endif
-#if defined( FL4_SET )
-        t_set(f,l)[0][i] = w;
-        t_set(f,l)[1][i] = upr(w,1);
-        t_set(f,l)[2][i] = upr(w,2);
-        t_set(f,l)[3][i] = upr(w,3);
-#endif
-
-#if defined( LS1_SET )			/* table for key schedule if t_set(f,l) above is*/
-        t_set(l,s)[i] = w;      /* not of the required form                     */
-#endif
-#if defined( LS4_SET )
-        t_set(l,s)[0][i] = w;
-        t_set(l,s)[1][i] = upr(w,1);
-        t_set(l,s)[2][i] = upr(w,2);
-        t_set(l,s)[3][i] = upr(w,3);
-#endif
-
-        b = fi(inv_affine((uint_8t)i));
-        w = bytes2word(fe(b), f9(b), fd(b), fb(b));
-
-#if defined( IM1_SET )			/* tables for the inverse mix column operation  */
-        t_set(i,m)[b] = w;
-#endif
-#if defined( IM4_SET )
-        t_set(i,m)[0][b] = w;
-        t_set(i,m)[1][b] = upr(w,1);
-        t_set(i,m)[2][b] = upr(w,2);
-        t_set(i,m)[3][b] = upr(w,3);
-#endif
-
-#if defined( ISB_SET )
-        t_set(i,box)[i] = b;
-#endif
-#if defined( IT1_SET )			/* tables for a normal decryption round */
-        t_set(i,n)[i] = w;
-#endif
-#if defined( IT4_SET )
-        t_set(i,n)[0][i] = w;
-        t_set(i,n)[1][i] = upr(w,1);
-        t_set(i,n)[2][i] = upr(w,2);
-        t_set(i,n)[3][i] = upr(w,3);
-#endif
-        w = bytes2word(b, 0, 0, 0);
-#if defined( IL1_SET )			/* tables for last decryption round */
-        t_set(i,l)[i] = w;
-#endif
-#if defined( IL4_SET )
-        t_set(i,l)[0][i] = w;
-        t_set(i,l)[1][i] = upr(w,1);
-        t_set(i,l)[2][i] = upr(w,2);
-        t_set(i,l)[3][i] = upr(w,3);
-#endif
-    }
-
-#endif // TC_WINDOWS_BOOT
-
-    init = 1;
-    return EXIT_SUCCESS;
-}
-
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+*/
+
+/* Adapted for TrueCrypt:
+   - Added run-time table generator for Aes_x86_v2.asm
+*/
+
+#define DO_TABLES
+
+#include "Aes.h"
+#include "Aesopt.h"
+
+#if defined(FIXED_TABLES)
+
+#define sb_data(w) {\
+    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
+    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
+    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
+    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
+    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
+    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
+    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
+    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
+    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
+    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
+    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
+    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
+    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
+    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
+    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
+    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
+    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
+    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
+    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
+    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
+    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
+    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
+    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
+    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
+    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
+    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
+    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
+    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
+    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
+    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
+    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
+    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
+
+#define isb_data(w) {\
+    w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), w(0x38),\
+    w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), w(0xd7), w(0xfb),\
+    w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), w(0x2f), w(0xff), w(0x87),\
+    w(0x34), w(0x8e), w(0x43), w(0x44), w(0xc4), w(0xde), w(0xe9), w(0xcb),\
+    w(0x54), w(0x7b), w(0x94), w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d),\
+    w(0xee), w(0x4c), w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e),\
+    w(0x08), w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2),\
+    w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), w(0x25),\
+    w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), w(0x98), w(0x16),\
+    w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), w(0x65), w(0xb6), w(0x92),\
+    w(0x6c), w(0x70), w(0x48), w(0x50), w(0xfd), w(0xed), w(0xb9), w(0xda),\
+    w(0x5e), w(0x15), w(0x46), w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84),\
+    w(0x90), w(0xd8), w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a),\
+    w(0xf7), w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06),\
+    w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), w(0x02),\
+    w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), w(0x8a), w(0x6b),\
+    w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), w(0x67), w(0xdc), w(0xea),\
+    w(0x97), w(0xf2), w(0xcf), w(0xce), w(0xf0), w(0xb4), w(0xe6), w(0x73),\
+    w(0x96), w(0xac), w(0x74), w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85),\
+    w(0xe2), w(0xf9), w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e),\
+    w(0x47), w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89),\
+    w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), w(0x1b),\
+    w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), w(0x79), w(0x20),\
+    w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), w(0xcd), w(0x5a), w(0xf4),\
+    w(0x1f), w(0xdd), w(0xa8), w(0x33), w(0x88), w(0x07), w(0xc7), w(0x31),\
+    w(0xb1), w(0x12), w(0x10), w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f),\
+    w(0x60), w(0x51), w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d),\
+    w(0x2d), w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef),\
+    w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), w(0xb0),\
+    w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), w(0x99), w(0x61),\
+    w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), w(0x77), w(0xd6), w(0x26),\
+    w(0xe1), w(0x69), w(0x14), w(0x63), w(0x55), w(0x21), w(0x0c), w(0x7d) }
+
+#define mm_data(w) {\
+    w(0x00), w(0x01), w(0x02), w(0x03), w(0x04), w(0x05), w(0x06), w(0x07),\
+    w(0x08), w(0x09), w(0x0a), w(0x0b), w(0x0c), w(0x0d), w(0x0e), w(0x0f),\
+    w(0x10), w(0x11), w(0x12), w(0x13), w(0x14), w(0x15), w(0x16), w(0x17),\
+    w(0x18), w(0x19), w(0x1a), w(0x1b), w(0x1c), w(0x1d), w(0x1e), w(0x1f),\
+    w(0x20), w(0x21), w(0x22), w(0x23), w(0x24), w(0x25), w(0x26), w(0x27),\
+    w(0x28), w(0x29), w(0x2a), w(0x2b), w(0x2c), w(0x2d), w(0x2e), w(0x2f),\
+    w(0x30), w(0x31), w(0x32), w(0x33), w(0x34), w(0x35), w(0x36), w(0x37),\
+    w(0x38), w(0x39), w(0x3a), w(0x3b), w(0x3c), w(0x3d), w(0x3e), w(0x3f),\
+    w(0x40), w(0x41), w(0x42), w(0x43), w(0x44), w(0x45), w(0x46), w(0x47),\
+    w(0x48), w(0x49), w(0x4a), w(0x4b), w(0x4c), w(0x4d), w(0x4e), w(0x4f),\
+    w(0x50), w(0x51), w(0x52), w(0x53), w(0x54), w(0x55), w(0x56), w(0x57),\
+    w(0x58), w(0x59), w(0x5a), w(0x5b), w(0x5c), w(0x5d), w(0x5e), w(0x5f),\
+    w(0x60), w(0x61), w(0x62), w(0x63), w(0x64), w(0x65), w(0x66), w(0x67),\
+    w(0x68), w(0x69), w(0x6a), w(0x6b), w(0x6c), w(0x6d), w(0x6e), w(0x6f),\
+    w(0x70), w(0x71), w(0x72), w(0x73), w(0x74), w(0x75), w(0x76), w(0x77),\
+    w(0x78), w(0x79), w(0x7a), w(0x7b), w(0x7c), w(0x7d), w(0x7e), w(0x7f),\
+    w(0x80), w(0x81), w(0x82), w(0x83), w(0x84), w(0x85), w(0x86), w(0x87),\
+    w(0x88), w(0x89), w(0x8a), w(0x8b), w(0x8c), w(0x8d), w(0x8e), w(0x8f),\
+    w(0x90), w(0x91), w(0x92), w(0x93), w(0x94), w(0x95), w(0x96), w(0x97),\
+    w(0x98), w(0x99), w(0x9a), w(0x9b), w(0x9c), w(0x9d), w(0x9e), w(0x9f),\
+    w(0xa0), w(0xa1), w(0xa2), w(0xa3), w(0xa4), w(0xa5), w(0xa6), w(0xa7),\
+    w(0xa8), w(0xa9), w(0xaa), w(0xab), w(0xac), w(0xad), w(0xae), w(0xaf),\
+    w(0xb0), w(0xb1), w(0xb2), w(0xb3), w(0xb4), w(0xb5), w(0xb6), w(0xb7),\
+    w(0xb8), w(0xb9), w(0xba), w(0xbb), w(0xbc), w(0xbd), w(0xbe), w(0xbf),\
+    w(0xc0), w(0xc1), w(0xc2), w(0xc3), w(0xc4), w(0xc5), w(0xc6), w(0xc7),\
+    w(0xc8), w(0xc9), w(0xca), w(0xcb), w(0xcc), w(0xcd), w(0xce), w(0xcf),\
+    w(0xd0), w(0xd1), w(0xd2), w(0xd3), w(0xd4), w(0xd5), w(0xd6), w(0xd7),\
+    w(0xd8), w(0xd9), w(0xda), w(0xdb), w(0xdc), w(0xdd), w(0xde), w(0xdf),\
+    w(0xe0), w(0xe1), w(0xe2), w(0xe3), w(0xe4), w(0xe5), w(0xe6), w(0xe7),\
+    w(0xe8), w(0xe9), w(0xea), w(0xeb), w(0xec), w(0xed), w(0xee), w(0xef),\
+    w(0xf0), w(0xf1), w(0xf2), w(0xf3), w(0xf4), w(0xf5), w(0xf6), w(0xf7),\
+    w(0xf8), w(0xf9), w(0xfa), w(0xfb), w(0xfc), w(0xfd), w(0xfe), w(0xff) }
+
+#define rc_data(w) {\
+    w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\
+    w(0x1b), w(0x36) }
+
+#define h0(x)   (x)                                 /* identity: the table entry is the byte itself */
+
+#define w0(p)   bytes2word(p, 0, 0, 0)              /* p in the 1st bytes2word argument, rest zero */
+#define w1(p)   bytes2word(0, p, 0, 0)              /* p in the 2nd argument */
+#define w2(p)   bytes2word(0, 0, p, 0)              /* p in the 3rd argument */
+#define w3(p)   bytes2word(0, 0, 0, p)              /* p in the 4th argument */
+
+#define u0(p)   bytes2word(f2(p), p, p, f3(p))      /* (f2, p, p, f3); same tuple aes_init() uses for t_set(f,n) */
+#define u1(p)   bytes2word(f3(p), f2(p), p, p)      /* u0's arguments rotated by one position */
+#define u2(p)   bytes2word(p, f3(p), f2(p), p)      /* rotated by two positions */
+#define u3(p)   bytes2word(p, p, f3(p), f2(p))      /* rotated by three positions */
+
+#define v0(p)   bytes2word(fe(p), f9(p), fd(p), fb(p))  /* (fe, f9, fd, fb); same tuple aes_init() uses for t_set(i,m) */
+#define v1(p)   bytes2word(fb(p), fe(p), f9(p), fd(p))  /* v0's arguments rotated by one position */
+#define v2(p)   bytes2word(fd(p), fb(p), fe(p), f9(p))  /* rotated by two positions */
+#define v3(p)   bytes2word(f9(p), fd(p), fb(p), fe(p))  /* rotated by three positions */
+
+#endif
+
+#if defined(FIXED_TABLES) || !defined(FF_TABLES)
+
+#define f2(x)   (((x)<<1) ^ ((((x)>>7) & 1) * WPOLY))   /* x times 2: shift, then fold in WPOLY if bit 7 was set */
+#define f4(x)   (((x)<<2) ^ ((((x)>>6) & 1) * WPOLY) ^ ((((x)>>6) & 2) * WPOLY))   /* x times 4 */
+#define f8(x)   (((x)<<3) ^ ((((x)>>5) & 1) * WPOLY) ^ ((((x)>>5) & 2) * WPOLY) \
+                        ^ ((((x)>>5) & 4) * WPOLY))
+#define f3(x)   (f2(x) ^ (x))                   /* 2x ^ x; x is parenthesised so expression arguments bind correctly */
+#define f9(x)   (f8(x) ^ (x))                   /* 8x ^ x           */
+#define fb(x)   (f8(x) ^ f2(x) ^ (x))           /* 8x ^ 2x ^ x      */
+#define fd(x)   (f8(x) ^ f4(x) ^ (x))           /* 8x ^ 4x ^ x      */
+#define fe(x)   (f8(x) ^ f4(x) ^ f2(x))         /* 8x ^ 4x ^ 2x     */
+
+#else
+
+#define f2(x) ((x) ? pow[log[x] + 0x19] : 0)   /* zero maps to zero; else pow[log[x] + k], k presumably log of 0x02 - confirm */
+#define f3(x) ((x) ? pow[log[x] + 0x01] : 0)   /* k = 0x01: log of the generator 0x03 used to build pow/log in aes_init() */
+#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0)   /* k presumably log of 0x09 */
+#define fb(x) ((x) ? pow[log[x] + 0x68] : 0)   /* k presumably log of 0x0b */
+#define fd(x) ((x) ? pow[log[x] + 0xee] : 0)   /* k presumably log of 0x0d */
+#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0)   /* k presumably log of 0x0e */
+#define fi(x) ((x) ? pow[ 255 - log[x]] : 0)   /* table-based counterpart of the fi() function; x is evaluated twice */
+
+#endif
+
+#include "Aestab.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#if defined(FIXED_TABLES)
+
+/* implemented in case of wrong call for fixed tables */
+
+AES_RETURN aes_init(void)      /* FIXED_TABLES build: no run-time table generation is performed */
+{
+    return EXIT_SUCCESS;       /* unconditional success so callers written for the dynamic build still work */
+}
+
+#else   /* dynamic table generation */
+
+#if !defined(FF_TABLES)
+
+/*  Generate the tables for the dynamic table option
+
+    It will generally be sensible to use tables to compute finite
+    field multiplies and inverses but where memory is scarce this
+    code might sometimes be better. But it only has effect during
+    initialisation so it's pretty unimportant in overall terms.
+*/
+
+/*  return 2 ^ (n - 1) where n is the bit number of the highest bit
+    set in x with x in the range 1 < x < 0x00000200.   This form is
+    used so that locals within fi can be bytes rather than words
+*/
+
+static uint_8t hibit(const uint_32t x)
+{   /* seed a byte with x/4 | x/2, then smear its top set bit into every lower position */
+    uint_8t mask = (uint_8t)((x >> 2) | (x >> 1));
+    mask = (uint_8t)(mask | (mask >> 2));
+    mask = (uint_8t)(mask | (mask >> 4));
+    return (uint_8t)((mask + 1) >> 1);      /* mask + 1 is one bit above the smear; halve it */
+}
+
+/* return the inverse of the finite field element x */
+
+static uint_8t fi(const uint_8t x)      /* field inverse of x; 0 and 1 are returned unchanged */
+{   uint_8t p1 = x, p2 = BPOLY, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0;
+    /* p1,p2: values being reduced; n1,n2: their hibit() markers; v1,v2 get the same updates as p1,p2 */
+    if(x < 2) return x;
+    /* NOTE(review): n2 starts at 0x80, presumably hibit() of the full modular polynomial whose low byte is BPOLY - confirm */
+    for(;;)
+    {
+        if(!n1) return v1;      /* hibit(p1) == 0 means p1 has dropped to 0 or 1 */
+
+        while(n2 >= n1)         /* reduce p2 by shifted copies of p1 */
+        {                       /* n2/n1 is the power of two that lines p1 up with p2's top bit */
+            n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2);
+        }
+
+        if(!n2) return v2;      /* hibit(p2) == 0 means p2 has dropped to 0 or 1 */
+
+        while(n1 >= n2)         /* the same step with the roles of p1 and p2 swapped */
+        {
+            n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1);
+        }
+    }
+}
+
+#endif
+
+/* The forward and inverse affine transformations used in the S-box.            */
+/* Both assign to a uint_32t scratch variable named w in the calling scope.     */
+#define fwd_affine(x) \
+    (w = (uint_32t)(x), w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(uint_8t)(w^(w>>8)))
+
+#define inv_affine(x) \
+    (w = (uint_32t)(x), w = (w<<1)^(w<<3)^(w<<6), 0x05^(uint_8t)(w^(w>>8)))
+
+static int init = 0;            /* set to 1 by aes_init() once the tables have been generated */
+
+#ifdef TC_WINDOWS_BOOT
+
+#pragma optimize ("l", on)
+uint_8t aes_enc_tab[256][8];    /* filled by aes_init(); presumably read by Aes_x86_v2.asm - confirm */
+uint_8t aes_dec_tab[256][8];    /* filled by aes_init(); same 8-bytes-per-entry layout */
+
+#endif
+
+AES_RETURN aes_init(void)       /* generate the run-time AES tables once; always returns EXIT_SUCCESS */
+{   uint_32t  i, w;             /* w is also the scratch variable assigned by fwd_affine/inv_affine */
+
+#ifdef TC_WINDOWS_BOOT
+
+	if (init)                   /* tables already generated by an earlier call */
+		return EXIT_SUCCESS;
+
+    for (i = 0; i < 256; ++i)
+    { 
+        uint_8t x = fwd_affine(fi((uint_8t)i));     /* forward affine transform of the field inverse of i */
+		aes_enc_tab[i][0] = 0;
+		aes_enc_tab[i][1] = x;
+		aes_enc_tab[i][2] = x;
+		aes_enc_tab[i][3] = f3(x);
+		aes_enc_tab[i][4] = f2(x);
+		aes_enc_tab[i][5] = x;
+		aes_enc_tab[i][6] = x;
+		aes_enc_tab[i][7] = f3(x);
+
+        x = fi((uint_8t)inv_affine((uint_8t)i));    /* field inverse of the inverse affine transform of i */
+		aes_dec_tab[i][0] = fe(x);
+		aes_dec_tab[i][1] = f9(x);
+		aes_dec_tab[i][2] = fd(x);
+		aes_dec_tab[i][3] = fb(x);
+		aes_dec_tab[i][4] = fe(x);
+		aes_dec_tab[i][5] = f9(x);
+		aes_dec_tab[i][6] = fd(x);
+		aes_dec_tab[i][7] = x;
+    }
+
+#else // TC_WINDOWS_BOOT
+
+#if defined(FF_TABLES)
+
+    uint_8t  pow[512], log[256];    /* local tables read by the FF_TABLES forms of f2..fe and fi */
+
+    if(init)
+        return EXIT_SUCCESS;
+    /*  log and power tables for GF(2^8) finite field with
+        WPOLY as modular polynomial - the simplest primitive
+        root is 0x03, used here to generate the tables
+    */
+
+    i = 0; w = 1;
+    do
+    {
+        pow[i] = (uint_8t)w;
+        pow[i + 255] = (uint_8t)w;          /* second copy so pow[log[x] + k] needs no index wrap-around */
+        log[w] = (uint_8t)i++;
+        w ^=  (w << 1) ^ (w & 0x80 ? WPOLY : 0);    /* w = w ^ (w << 1), folding in WPOLY when bit 7 was set */
+    }
+    while (w != 1);
+
+#else
+    if(init)
+        return EXIT_SUCCESS;
+#endif
+
+    for(i = 0, w = 1; i < RC_LENGTH; ++i)   /* t_set(r,c): w starts at 1 and is doubled with f2 each step */
+    {
+        t_set(r,c)[i] = bytes2word(w, 0, 0, 0);
+        w = f2(w);
+    }
+
+    for(i = 0; i < 256; ++i)
+    {   uint_8t    b;
+
+        b = fwd_affine(fi((uint_8t)i));     /* forward affine transform of the field inverse of i */
+        w = bytes2word(f2(b), b, b, f3(b));
+
+#if defined( SBX_SET )
+        t_set(s,box)[i] = b;
+#endif
+
+#if defined( FT1_SET )                 /* tables for a normal encryption round */
+        t_set(f,n)[i] = w;
+#endif
+#if defined( FT4_SET )
+        t_set(f,n)[0][i] = w;
+        t_set(f,n)[1][i] = upr(w,1);
+        t_set(f,n)[2][i] = upr(w,2);
+        t_set(f,n)[3][i] = upr(w,3);
+#endif
+        w = bytes2word(b, 0, 0, 0);
+
+#if defined( FL1_SET )            /* tables for last encryption round (may also   */
+        t_set(f,l)[i] = w;        /* be used in the key schedule)                 */
+#endif
+#if defined( FL4_SET )
+        t_set(f,l)[0][i] = w;
+        t_set(f,l)[1][i] = upr(w,1);
+        t_set(f,l)[2][i] = upr(w,2);
+        t_set(f,l)[3][i] = upr(w,3);
+#endif
+
+#if defined( LS1_SET )			/* table for key schedule if t_set(f,l) above is*/
+        t_set(l,s)[i] = w;      /* not of the required form                     */
+#endif
+#if defined( LS4_SET )
+        t_set(l,s)[0][i] = w;
+        t_set(l,s)[1][i] = upr(w,1);
+        t_set(l,s)[2][i] = upr(w,2);
+        t_set(l,s)[3][i] = upr(w,3);
+#endif
+
+        b = fi(inv_affine((uint_8t)i));     /* field inverse of the inverse affine transform of i */
+        w = bytes2word(fe(b), f9(b), fd(b), fb(b));
+
+#if defined( IM1_SET )			/* tables for the inverse mix column operation  */
+        t_set(i,m)[b] = w;      /* indexed by b, not i: w was computed from b itself */
+#endif
+#if defined( IM4_SET )
+        t_set(i,m)[0][b] = w;
+        t_set(i,m)[1][b] = upr(w,1);
+        t_set(i,m)[2][b] = upr(w,2);
+        t_set(i,m)[3][b] = upr(w,3);
+#endif
+
+#if defined( ISB_SET )
+        t_set(i,box)[i] = b;
+#endif
+#if defined( IT1_SET )			/* tables for a normal decryption round */
+        t_set(i,n)[i] = w;
+#endif
+#if defined( IT4_SET )
+        t_set(i,n)[0][i] = w;
+        t_set(i,n)[1][i] = upr(w,1);
+        t_set(i,n)[2][i] = upr(w,2);
+        t_set(i,n)[3][i] = upr(w,3);
+#endif
+        w = bytes2word(b, 0, 0, 0);
+#if defined( IL1_SET )			/* tables for last decryption round */
+        t_set(i,l)[i] = w;
+#endif
+#if defined( IL4_SET )
+        t_set(i,l)[0][i] = w;
+        t_set(i,l)[1][i] = upr(w,1);
+        t_set(i,l)[2][i] = upr(w,2);
+        t_set(i,l)[3][i] = upr(w,3);
+#endif
+    }
+
+#endif // TC_WINDOWS_BOOT
+
+    init = 1;                   /* later calls return at the if(init) check above */
+    return EXIT_SUCCESS;
+}
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
diff --git a/src/Crypto/Aestab.h b/src/Crypto/Aestab.h
index 2ad1b034..e52e0057 100644
--- a/src/Crypto/Aestab.h
+++ b/src/Crypto/Aestab.h
@@ -1,174 +1,174 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 20/12/2007
-
- This file contains the code for declaring the tables needed to implement
- AES. The file aesopt.h is assumed to be included before this header file.
- If there are no global variables, the definitions here can be used to put
- the AES tables in a structure so that a pointer can then be added to the
- AES context to pass them to the AES routines that need them.   If this
- facility is used, the calling program has to ensure that this pointer is
- managed appropriately.  In particular, the value of the t_dec(in,it) item
- in the table structure must be set to zero in order to ensure that the
- tables are initialised. In practice the three code sequences in aeskey.c
- that control the calls to aes_init() and the aes_init() routine itself will
- have to be changed for a specific implementation. If global variables are
- available it will generally be preferable to use them with the precomputed
- FIXED_TABLES option that uses static global tables.
-
- The following defines can be used to control the way the tables
- are defined, initialised and used in embedded environments that
- require special features for these purposes
-
-    the 't_dec' construction is used to declare fixed table arrays
-    the 't_set' construction is used to set fixed table values
-    the 't_use' construction is used to access fixed table values
-
-    256 byte tables:
-
-        t_xxx(s,box)    => forward S box
-        t_xxx(i,box)    => inverse S box
-
-    256 32-bit word OR 4 x 256 32-bit word tables:
-
-        t_xxx(f,n)      => forward normal round
-        t_xxx(f,l)      => forward last round
-        t_xxx(i,n)      => inverse normal round
-        t_xxx(i,l)      => inverse last round
-        t_xxx(l,s)      => key schedule table
-        t_xxx(i,m)      => key schedule table
-
-    Other variables and tables:
-
-        t_xxx(r,c)      => the rcon table
-*/
-
-#if !defined( _AESTAB_H )
-#define _AESTAB_H
-
-#define t_dec(m,n) t_##m##n
-#define t_set(m,n) t_##m##n
-#define t_use(m,n) t_##m##n
-
-#if defined(FIXED_TABLES)
-#  if !defined( __GNUC__ ) && (defined( __MSDOS__ ) || defined( __WIN16__ ))
-/*   make tables far data to avoid using too much DGROUP space (PG) */
-#    define CONST const far
-#  else
-#    define CONST const
-#  endif
-#else
-#  define CONST
-#endif
-
-#if defined(__cplusplus)
-#  define EXTERN extern "C"
-#elif defined(DO_TABLES)
-#  define EXTERN
-#else
-#  define EXTERN extern
-#endif
-
-#if defined(_MSC_VER) && defined(TABLE_ALIGN)
-#define ALIGN __declspec(align(TABLE_ALIGN))
-#else
-#define ALIGN
-#endif
-
-#if defined( __WATCOMC__ ) && ( __WATCOMC__ >= 1100 )
-#  define XP_DIR __cdecl
-#else
-#  define XP_DIR
-#endif
-
-#if defined(DO_TABLES) && defined(FIXED_TABLES)
-#define d_1(t,n,b,e)       EXTERN ALIGN CONST XP_DIR t n[256]    =   b(e)
-#define d_4(t,n,b,e,f,g,h) EXTERN ALIGN CONST XP_DIR t n[4][256] = { b(e), b(f), b(g), b(h) }
-EXTERN ALIGN CONST uint_32t t_dec(r,c)[RC_LENGTH] = rc_data(w0);
-#else
-#define d_1(t,n,b,e)       EXTERN ALIGN CONST XP_DIR t n[256]
-#define d_4(t,n,b,e,f,g,h) EXTERN ALIGN CONST XP_DIR t n[4][256]
-EXTERN ALIGN CONST uint_32t t_dec(r,c)[RC_LENGTH];
-#endif
-
-#if defined( SBX_SET )
-    d_1(uint_8t, t_dec(s,box), sb_data, h0);
-#endif
-#if defined( ISB_SET )
-    d_1(uint_8t, t_dec(i,box), isb_data, h0);
-#endif
-
-#if defined( FT1_SET )
-    d_1(uint_32t, t_dec(f,n), sb_data, u0);
-#endif
-#if defined( FT4_SET )
-    d_4(uint_32t, t_dec(f,n), sb_data, u0, u1, u2, u3);
-#endif
-
-#if defined( FL1_SET )
-    d_1(uint_32t, t_dec(f,l), sb_data, w0);
-#endif
-#if defined( FL4_SET )
-    d_4(uint_32t, t_dec(f,l), sb_data, w0, w1, w2, w3);
-#endif
-
-#if defined( IT1_SET )
-    d_1(uint_32t, t_dec(i,n), isb_data, v0);
-#endif
-#if defined( IT4_SET )
-    d_4(uint_32t, t_dec(i,n), isb_data, v0, v1, v2, v3);
-#endif
-
-#if defined( IL1_SET )
-    d_1(uint_32t, t_dec(i,l), isb_data, w0);
-#endif
-#if defined( IL4_SET )
-    d_4(uint_32t, t_dec(i,l), isb_data, w0, w1, w2, w3);
-#endif
-
-#if defined( LS1_SET )
-#if defined( FL1_SET )
-#undef  LS1_SET
-#else
-    d_1(uint_32t, t_dec(l,s), sb_data, w0);
-#endif
-#endif
-
-#if defined( LS4_SET )
-#if defined( FL4_SET )
-#undef  LS4_SET
-#else
-    d_4(uint_32t, t_dec(l,s), sb_data, w0, w1, w2, w3);
-#endif
-#endif
-
-#if defined( IM1_SET )
-    d_1(uint_32t, t_dec(i,m), mm_data, v0);
-#endif
-#if defined( IM4_SET )
-    d_4(uint_32t, t_dec(i,m), mm_data, v0, v1, v2, v3);
-#endif
-
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+
+ This file contains the code for declaring the tables needed to implement
+ AES. The file aesopt.h is assumed to be included before this header file.
+ If there are no global variables, the definitions here can be used to put
+ the AES tables in a structure so that a pointer can then be added to the
+ AES context to pass them to the AES routines that need them.   If this
+ facility is used, the calling program has to ensure that this pointer is
+ managed appropriately.  In particular, the value of the t_dec(in,it) item
+ in the table structure must be set to zero in order to ensure that the
+ tables are initialised. In practice the three code sequences in aeskey.c
+ that control the calls to aes_init() and the aes_init() routine itself will
+ have to be changed for a specific implementation. If global variables are
+ available it will generally be preferable to use them with the precomputed
+ FIXED_TABLES option that uses static global tables.
+
+ The following defines can be used to control the way the tables
+ are defined, initialised and used in embedded environments that
+ require special features for these purposes
+
+    the 't_dec' construction is used to declare fixed table arrays
+    the 't_set' construction is used to set fixed table values
+    the 't_use' construction is used to access fixed table values
+
+    256 byte tables:
+
+        t_xxx(s,box)    => forward S box
+        t_xxx(i,box)    => inverse S box
+
+    256 32-bit word OR 4 x 256 32-bit word tables:
+
+        t_xxx(f,n)      => forward normal round
+        t_xxx(f,l)      => forward last round
+        t_xxx(i,n)      => inverse normal round
+        t_xxx(i,l)      => inverse last round
+        t_xxx(l,s)      => key schedule table
+        t_xxx(i,m)      => inverse mix column table (key schedule)
+
+    Other variables and tables:
+
+        t_xxx(r,c)      => the rcon table
+*/
+
+#if !defined( _AESTAB_H )
+#define _AESTAB_H
+
+#define t_dec(m,n) t_##m##n   /* table name for declarations: pastes to the identifier t_<m><n> */
+#define t_set(m,n) t_##m##n   /* table name for writing entries: same identifier as t_dec */
+#define t_use(m,n) t_##m##n   /* table name for reading entries: same identifier as t_dec */
+
+#if defined(FIXED_TABLES)   /* CONST: qualifier placed on every table declared in this header */
+#  if !defined( __GNUC__ ) && (defined( __MSDOS__ ) || defined( __WIN16__ ))
+/*   16-bit DOS/Windows, non-GNU compilers: make the tables far data so they do not use up DGROUP space (PG) */
+#    define CONST const far
+#  else
+#    define CONST const
+#  endif
+#else   /* FIXED_TABLES not defined: CONST expands to nothing, tables are not const-qualified */
+#  define CONST
+#endif
+
+#if defined(__cplusplus)   /* EXTERN: storage-class/linkage prefix for every table declaration */
+#  define EXTERN extern "C"
+#elif defined(DO_TABLES)   /* C with DO_TABLES defined: no 'extern' prefix on the declarations */
+#  define EXTERN
+#else   /* C without DO_TABLES: tables are only declared extern here */
+#  define EXTERN extern
+#endif
+
+#if defined(_MSC_VER) && defined(TABLE_ALIGN)   /* MSVC only, and only when TABLE_ALIGN is defined */
+#define ALIGN __declspec(align(TABLE_ALIGN))
+#else   /* otherwise ALIGN expands to nothing: no explicit alignment is requested */
+#define ALIGN
+#endif
+
+#if defined( __WATCOMC__ ) && ( __WATCOMC__ >= 1100 )   /* Watcom C with __WATCOMC__ >= 1100 */
+#  define XP_DIR __cdecl   /* NOTE(review): presumably fixes the tables' external symbol naming - confirm */
+#else   /* all other compilers: XP_DIR expands to nothing */
+#  define XP_DIR
+#endif
+
+#if defined(DO_TABLES) && defined(FIXED_TABLES)   /* d_1/d_4 declare table n of type t as [256] / [4][256], initialised from b(e) / b(e)..b(h) */
+#define d_1(t,n,b,e)       EXTERN ALIGN CONST XP_DIR t n[256]    =   b(e)
+#define d_4(t,n,b,e,f,g,h) EXTERN ALIGN CONST XP_DIR t n[4][256] = { b(e), b(f), b(g), b(h) }
+EXTERN ALIGN CONST uint_32t t_dec(r,c)[RC_LENGTH] = rc_data(w0);   /* the rcon table, initialised from rc_data */
+#else   /* otherwise: same declarations without initialisers; the b, e..h arguments are ignored */
+#define d_1(t,n,b,e)       EXTERN ALIGN CONST XP_DIR t n[256]
+#define d_4(t,n,b,e,f,g,h) EXTERN ALIGN CONST XP_DIR t n[4][256]
+EXTERN ALIGN CONST uint_32t t_dec(r,c)[RC_LENGTH];   /* the rcon table, declaration only */
+#endif
+
+#if defined( SBX_SET )   /* t_sbox: forward S box, 256 entries of uint_8t */
+    d_1(uint_8t, t_dec(s,box), sb_data, h0);
+#endif
+#if defined( ISB_SET )   /* t_ibox: inverse S box, 256 entries of uint_8t */
+    d_1(uint_8t, t_dec(i,box), isb_data, h0);
+#endif
+
+#if defined( FT1_SET )   /* t_fn: forward normal round, one 256-word table */
+    d_1(uint_32t, t_dec(f,n), sb_data, u0);
+#endif
+#if defined( FT4_SET )   /* t_fn: forward normal round, four 256-word tables */
+    d_4(uint_32t, t_dec(f,n), sb_data, u0, u1, u2, u3);
+#endif
+
+#if defined( FL1_SET )   /* t_fl: forward last round, one 256-word table */
+    d_1(uint_32t, t_dec(f,l), sb_data, w0);
+#endif
+#if defined( FL4_SET )   /* t_fl: forward last round, four 256-word tables */
+    d_4(uint_32t, t_dec(f,l), sb_data, w0, w1, w2, w3);
+#endif
+
+#if defined( IT1_SET )   /* t_in: inverse normal round, one 256-word table */
+    d_1(uint_32t, t_dec(i,n), isb_data, v0);
+#endif
+#if defined( IT4_SET )   /* t_in: inverse normal round, four 256-word tables */
+    d_4(uint_32t, t_dec(i,n), isb_data, v0, v1, v2, v3);
+#endif
+
+#if defined( IL1_SET )   /* t_il: inverse last round, one 256-word table */
+    d_1(uint_32t, t_dec(i,l), isb_data, w0);
+#endif
+#if defined( IL4_SET )   /* t_il: inverse last round, four 256-word tables */
+    d_4(uint_32t, t_dec(i,l), isb_data, w0, w1, w2, w3);
+#endif
+
+#if defined( LS1_SET )   /* t_ls: key schedule table, single-table form */
+#if defined( FL1_SET )   /* t_fl is declared with the same arguments (sb_data, w0): drop LS1_SET, no t_ls is declared */
+#undef  LS1_SET
+#else
+    d_1(uint_32t, t_dec(l,s), sb_data, w0);
+#endif
+#endif
+
+#if defined( LS4_SET )   /* t_ls: key schedule table, four-table form */
+#if defined( FL4_SET )   /* t_fl is declared with the same arguments (sb_data, w0..w3): drop LS4_SET, no t_ls is declared */
+#undef  LS4_SET
+#else
+    d_4(uint_32t, t_dec(l,s), sb_data, w0, w1, w2, w3);
+#endif
+#endif
+
+#if defined( IM1_SET )   /* t_im: table built from mm_data, one 256-word table */
+    d_1(uint_32t, t_dec(i,m), mm_data, v0);
+#endif
+#if defined( IM4_SET )   /* t_im: table built from mm_data, four 256-word tables */
+    d_4(uint_32t, t_dec(i,m), mm_data, v0, v1, v2, v3);
+#endif
+
+#endif
diff --git a/src/Crypto/Crypto.vcproj b/src/Crypto/Crypto.vcproj
index 24b012c5..50f67a11 100644
--- a/src/Crypto/Crypto.vcproj
+++ b/src/Crypto/Crypto.vcproj
@@ -1,517 +1,517 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioProject
-	ProjectType="Visual C++"
-	Version="9.00"
-	Name="Crypto"
-	ProjectGUID="{993245CF-6B70-47EE-91BB-39F8FC6DC0E7}"
-	RootNamespace="Crypto"
-	Keyword="Win32Proj"
-	TargetFrameworkVersion="131072"
-	>
-	<Platforms>
-		<Platform
-			Name="Win32"
-		/>
-		<Platform
-			Name="x64"
-		/>
-	</Platforms>
-	<ToolFiles>
-	</ToolFiles>
-	<Configurations>
-		<Configuration
-			Name="Debug|Win32"
-			OutputDirectory="Debug"
-			IntermediateDirectory="Debug"
-			ConfigurationType="4"
-			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
-			CharacterSet="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="0"
-				AdditionalIncludeDirectories="&quot;$(ProjectDir)\..&quot;;&quot;$(ProjectDir)\..\Common&quot;"
-				PreprocessorDefinitions="WIN32;DEBUG;_DEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS"
-				MinimalRebuild="true"
-				BasicRuntimeChecks="0"
-				RuntimeLibrary="1"
-				BufferSecurityCheck="false"
-				UsePrecompiledHeader="0"
-				WarningLevel="4"
-				DebugInformationFormat="3"
-				DisableSpecificWarnings="4100;4127;4201"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLibrarianTool"
-				OutputFile="$(OutDir)/Crypto.lib"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-			/>
-		</Configuration>
-		<Configuration
-			Name="Debug|x64"
-			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
-			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
-			ConfigurationType="4"
-			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
-			CharacterSet="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-				TargetEnvironment="3"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="0"
-				AdditionalIncludeDirectories="&quot;$(ProjectDir)\..&quot;;&quot;$(ProjectDir)\..\Common&quot;"
-				PreprocessorDefinitions="WIN32;DEBUG;_DEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS"
-				MinimalRebuild="true"
-				BasicRuntimeChecks="0"
-				RuntimeLibrary="1"
-				BufferSecurityCheck="false"
-				UsePrecompiledHeader="0"
-				WarningLevel="4"
-				DebugInformationFormat="3"
-				DisableSpecificWarnings="4100;4127;4201"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLibrarianTool"
-				OutputFile="$(OutDir)/Crypto.lib"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-			/>
-		</Configuration>
-		<Configuration
-			Name="Release|Win32"
-			OutputDirectory="Release"
-			IntermediateDirectory="Release"
-			ConfigurationType="4"
-			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
-			CharacterSet="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="2"
-				AdditionalIncludeDirectories="&quot;$(ProjectDir)\..&quot;;&quot;$(ProjectDir)\..\Common&quot;"
-				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS"
-				RuntimeLibrary="0"
-				BufferSecurityCheck="true"
-				UsePrecompiledHeader="0"
-				AssemblerOutput="2"
-				AssemblerListingLocation="$(IntDir)/"
-				WarningLevel="4"
-				Detect64BitPortabilityProblems="false"
-				DebugInformationFormat="0"
-				DisableSpecificWarnings="4100;4127;4201"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLibrarianTool"
-				OutputFile="$(OutDir)/Crypto.lib"
-				AdditionalLibraryDirectories="$(TargetDir)"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-			/>
-		</Configuration>
-		<Configuration
-			Name="Release|x64"
-			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
-			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
-			ConfigurationType="4"
-			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
-			CharacterSet="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-				TargetEnvironment="3"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="2"
-				AdditionalIncludeDirectories="&quot;$(ProjectDir)\..&quot;;&quot;$(ProjectDir)\..\Common&quot;"
-				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS"
-				RuntimeLibrary="0"
-				BufferSecurityCheck="true"
-				UsePrecompiledHeader="0"
-				AssemblerOutput="2"
-				AssemblerListingLocation="$(IntDir)/"
-				WarningLevel="4"
-				Detect64BitPortabilityProblems="false"
-				DebugInformationFormat="0"
-				DisableSpecificWarnings="4100;4127;4201"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLibrarianTool"
-				OutputFile="$(OutDir)/Crypto.lib"
-				AdditionalLibraryDirectories="$(TargetDir)"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-			/>
-		</Configuration>
-	</Configurations>
-	<References>
-	</References>
-	<Files>
-		<Filter
-			Name="Source Files"
-			Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
-			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
-			>
-			<File
-				RelativePath=".\Aes_hw_cpu.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox -g --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win64 -Ox -g -o &quot;$(TargetDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win64 -Ox -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath=".\Aes_x64.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win64 -Ox -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win64 -Ox -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath=".\Aes_x86.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox -g --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox -g --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
-						Outputs="$(TargetDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath=".\Aeskey.c"
-				>
-			</File>
-			<File
-				RelativePath=".\Aestab.c"
-				>
-			</File>
-			<File
-				RelativePath=".\cpu.c"
-				>
-			</File>
-			<File
-				RelativePath=".\Rmd160.c"
-				>
-			</File>
-			<File
-				RelativePath=".\Serpent.c"
-				>
-			</File>
-			<File
-				RelativePath=".\Sha2.c"
-				>
-			</File>
-			<File
-				RelativePath=".\Twofish.c"
-				>
-			</File>
-			<File
-				RelativePath=".\Whirlpool.c"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Header Files"
-			Filter="h;hpp;hxx;hm;inl;inc;xsd"
-			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
-			>
-			<File
-				RelativePath=".\Aes.h"
-				>
-			</File>
-			<File
-				RelativePath=".\Aes_hw_cpu.h"
-				>
-			</File>
-			<File
-				RelativePath=".\Aesopt.h"
-				>
-			</File>
-			<File
-				RelativePath=".\Aestab.h"
-				>
-			</File>
-			<File
-				RelativePath=".\config.h"
-				>
-			</File>
-			<File
-				RelativePath=".\cpu.h"
-				>
-			</File>
-			<File
-				RelativePath=".\misc.h"
-				>
-			</File>
-			<File
-				RelativePath=".\Rmd160.h"
-				>
-			</File>
-			<File
-				RelativePath=".\Serpent.h"
-				>
-			</File>
-			<File
-				RelativePath=".\Sha2.h"
-				>
-			</File>
-			<File
-				RelativePath=".\Twofish.h"
-				>
-			</File>
-			<File
-				RelativePath=".\Whirlpool.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Resource Files"
-			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
-			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
-			>
-		</Filter>
-	</Files>
-	<Globals>
-	</Globals>
-</VisualStudioProject>
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="Crypto"
+	ProjectGUID="{993245CF-6B70-47EE-91BB-39F8FC6DC0E7}"
+	RootNamespace="Crypto"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="131072"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+		<Platform
+			Name="x64"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="Debug"
+			IntermediateDirectory="Debug"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="&quot;$(ProjectDir)\..&quot;;&quot;$(ProjectDir)\..\Common&quot;"
+				PreprocessorDefinitions="WIN32;DEBUG;_DEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="0"
+				RuntimeLibrary="1"
+				BufferSecurityCheck="false"
+				UsePrecompiledHeader="0"
+				WarningLevel="4"
+				DebugInformationFormat="3"
+				DisableSpecificWarnings="4100;4127;4201"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)/Crypto.lib"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|x64"
+			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
+			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="&quot;$(ProjectDir)\..&quot;;&quot;$(ProjectDir)\..\Common&quot;"
+				PreprocessorDefinitions="WIN32;DEBUG;_DEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="0"
+				RuntimeLibrary="1"
+				BufferSecurityCheck="false"
+				UsePrecompiledHeader="0"
+				WarningLevel="4"
+				DebugInformationFormat="3"
+				DisableSpecificWarnings="4100;4127;4201"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)/Crypto.lib"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="Release"
+			IntermediateDirectory="Release"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				AdditionalIncludeDirectories="&quot;$(ProjectDir)\..&quot;;&quot;$(ProjectDir)\..\Common&quot;"
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS"
+				RuntimeLibrary="0"
+				BufferSecurityCheck="true"
+				UsePrecompiledHeader="0"
+				AssemblerOutput="2"
+				AssemblerListingLocation="$(IntDir)/"
+				WarningLevel="4"
+				Detect64BitPortabilityProblems="false"
+				DebugInformationFormat="0"
+				DisableSpecificWarnings="4100;4127;4201"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)/Crypto.lib"
+				AdditionalLibraryDirectories="$(TargetDir)"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|x64"
+			OutputDirectory="$(PlatformName)\$(ConfigurationName)"
+			IntermediateDirectory="$(PlatformName)\$(ConfigurationName)"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				AdditionalIncludeDirectories="&quot;$(ProjectDir)\..&quot;;&quot;$(ProjectDir)\..\Common&quot;"
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS"
+				RuntimeLibrary="0"
+				BufferSecurityCheck="true"
+				UsePrecompiledHeader="0"
+				AssemblerOutput="2"
+				AssemblerListingLocation="$(IntDir)/"
+				WarningLevel="4"
+				Detect64BitPortabilityProblems="false"
+				DebugInformationFormat="0"
+				DisableSpecificWarnings="4100;4127;4201"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)/Crypto.lib"
+				AdditionalLibraryDirectories="$(TargetDir)"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath=".\Aes_hw_cpu.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox -g --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win64 -Ox -g -o &quot;$(TargetDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win64 -Ox -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\Aes_x64.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					ExcludedFromBuild="true"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win64 -Ox -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					ExcludedFromBuild="true"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win64 -Ox -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\Aes_x86.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox -g --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					ExcludedFromBuild="true"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox -g --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					ExcludedFromBuild="true"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="echo $(InputFileName) &amp; nasm.exe -Xvc -f win32 -Ox --prefix _ -o &quot;$(TargetDir)\$(InputName).obj&quot; -l &quot;$(TargetDir)\$(InputName).lst&quot; &quot;$(InputPath)&quot;&#x0D;&#x0A;"
+						Outputs="$(TargetDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\Aeskey.c"
+				>
+			</File>
+			<File
+				RelativePath=".\Aestab.c"
+				>
+			</File>
+			<File
+				RelativePath=".\cpu.c"
+				>
+			</File>
+			<File
+				RelativePath=".\Rmd160.c"
+				>
+			</File>
+			<File
+				RelativePath=".\Serpent.c"
+				>
+			</File>
+			<File
+				RelativePath=".\Sha2.c"
+				>
+			</File>
+			<File
+				RelativePath=".\Twofish.c"
+				>
+			</File>
+			<File
+				RelativePath=".\Whirlpool.c"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath=".\Aes.h"
+				>
+			</File>
+			<File
+				RelativePath=".\Aes_hw_cpu.h"
+				>
+			</File>
+			<File
+				RelativePath=".\Aesopt.h"
+				>
+			</File>
+			<File
+				RelativePath=".\Aestab.h"
+				>
+			</File>
+			<File
+				RelativePath=".\config.h"
+				>
+			</File>
+			<File
+				RelativePath=".\cpu.h"
+				>
+			</File>
+			<File
+				RelativePath=".\misc.h"
+				>
+			</File>
+			<File
+				RelativePath=".\Rmd160.h"
+				>
+			</File>
+			<File
+				RelativePath=".\Serpent.h"
+				>
+			</File>
+			<File
+				RelativePath=".\Sha2.h"
+				>
+			</File>
+			<File
+				RelativePath=".\Twofish.h"
+				>
+			</File>
+			<File
+				RelativePath=".\Whirlpool.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/src/Crypto/Makefile b/src/Crypto/Makefile
index 53b9a3d6..5acbbd24 100644
--- a/src/Crypto/Makefile
+++ b/src/Crypto/Makefile
@@ -1 +1 @@
-!INCLUDE $(NTMAKEENV)\makefile.def
+!INCLUDE $(NTMAKEENV)\makefile.def
diff --git a/src/Crypto/Makefile.inc b/src/Crypto/Makefile.inc
index 51c4f46d..955f2a76 100644
--- a/src/Crypto/Makefile.inc
+++ b/src/Crypto/Makefile.inc
@@ -1,15 +1,15 @@
-TC_ASFLAGS = -Xvc -Ox
-
-!if "$(TC_ARCH)" == "x86"
-TC_ASFLAGS = $(TC_ASFLAGS) -f win32 --prefix _ -D MS_STDCALL -D DLL_EXPORT
-!else
-TC_ASFLAGS = $(TC_ASFLAGS) -f win64
-!endif
-
-TC_ASM_ERR_LOG = ..\Driver\build_errors_asm.log
-
-"$(OBJ_PATH)\$(O)\Aes_$(TC_ARCH).obj": Aes_$(TC_ARCH).asm
-	nasm.exe $(TC_ASFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Aes_$(TC_ARCH).lst" Aes_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG)
-
-"$(OBJ_PATH)\$(O)\Aes_hw_cpu.obj": Aes_hw_cpu.asm
-	nasm.exe $(TC_ASFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Aes_hw_cpu.lst" Aes_hw_cpu.asm 2>$(TC_ASM_ERR_LOG)
+TC_ASFLAGS = -Xvc -Ox
+
+!if "$(TC_ARCH)" == "x86"
+TC_ASFLAGS = $(TC_ASFLAGS) -f win32 --prefix _ -D MS_STDCALL -D DLL_EXPORT
+!else
+TC_ASFLAGS = $(TC_ASFLAGS) -f win64
+!endif
+
+TC_ASM_ERR_LOG = ..\Driver\build_errors_asm.log
+
+"$(OBJ_PATH)\$(O)\Aes_$(TC_ARCH).obj": Aes_$(TC_ARCH).asm
+	nasm.exe $(TC_ASFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Aes_$(TC_ARCH).lst" Aes_$(TC_ARCH).asm 2>$(TC_ASM_ERR_LOG)
+
+"$(OBJ_PATH)\$(O)\Aes_hw_cpu.obj": Aes_hw_cpu.asm
+	nasm.exe $(TC_ASFLAGS) -o "$@" -l "$(OBJ_PATH)\$(O)\Aes_hw_cpu.lst" Aes_hw_cpu.asm 2>$(TC_ASM_ERR_LOG)
diff --git a/src/Crypto/Rmd160.c b/src/Crypto/Rmd160.c
index f94f5e08..75a34c3e 100644
--- a/src/Crypto/Rmd160.c
+++ b/src/Crypto/Rmd160.c
@@ -1,498 +1,498 @@
-// RIPEMD-160 written and placed in the public domain by Wei Dai
-
-/*
- * This code implements the MD4 message-digest algorithm.
- * The algorithm is due to Ron Rivest.  This code was
- * written by Colin Plumb in 1993, no copyright is claimed.
- * This code is in the public domain; do with it what you wish.
- */
-
-/* Adapted for TrueCrypt */
-/* Adapted for VeraCrypt */
-
-#include <memory.h>
-#include "Common/Tcdefs.h"
-#include "Common/Endian.h"
-#include "Rmd160.h"
-
-#define F(x, y, z)    (x ^ y ^ z) 
-#define G(x, y, z)    (z ^ (x & (y^z)))
-#define H(x, y, z)    (z ^ (x | ~y))
-#define I(x, y, z)    (y ^ (z & (x^y)))
-#define J(x, y, z)    (x ^ (y | ~z))
-
-#define PUT_64BIT_LE(cp, value) do {                                    \
-	(cp)[7] = (byte) ((value) >> 56);                                        \
-	(cp)[6] = (byte) ((value) >> 48);                                        \
-	(cp)[5] = (byte) ((value) >> 40);                                        \
-	(cp)[4] = (byte) ((value) >> 32);                                        \
-	(cp)[3] = (byte) ((value) >> 24);                                        \
-	(cp)[2] = (byte) ((value) >> 16);                                        \
-	(cp)[1] = (byte) ((value) >> 8);                                         \
-	(cp)[0] = (byte) (value); } while (0)
-
-#define PUT_32BIT_LE(cp, value) do {                                    \
-	(cp)[3] = (byte) ((value) >> 24);                                        \
-	(cp)[2] = (byte) ((value) >> 16);                                        \
-	(cp)[1] = (byte) ((value) >> 8);                                         \
-	(cp)[0] = (byte) (value); } while (0)
-
-#ifndef TC_MINIMIZE_CODE_SIZE
-
-static byte PADDING[64] = {
-	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-#else
-
-static byte PADDING[64];
-
-#endif
-
-void RMD160Init (RMD160_CTX *ctx)
-{
-	ctx->count = 0;
-	ctx->state[0] = 0x67452301;
-	ctx->state[1] = 0xefcdab89;
-	ctx->state[2] = 0x98badcfe;
-	ctx->state[3] = 0x10325476;
-	ctx->state[4] = 0xc3d2e1f0;
-	PADDING[0] = 0x80;
-}
-
-/*
-* Update context to reflect the concatenation of another buffer full
-* of bytes.
-*/
-void RMD160Update (RMD160_CTX *ctx, const unsigned char *input, unsigned __int32 lenArg)
-{
-#ifndef TC_WINDOWS_BOOT
-	uint64 len = lenArg;
-#else
-	uint32 len = lenArg;
-#endif
-	unsigned int have, need;
-
-	/* Check how many bytes we already have and how many more we need. */
-	have = (unsigned int) ((ctx->count) & (RIPEMD160_BLOCK_LENGTH - 1));
-	need = RIPEMD160_BLOCK_LENGTH - have;
-
-	/* Update bitcount */
-	ctx->count += len;
-
-	if (len >= need) {
-		if (have != 0) {
-			memcpy (ctx->buffer + have, input, (size_t) need);
-			RMD160Transform ((uint32 *) ctx->state, (const uint32 *) ctx->buffer);
-			input += need;
-			len -= need;
-			have = 0;
-		}
-
-		/* Process data in RIPEMD160_BLOCK_LENGTH-byte chunks. */
-		while (len >= RIPEMD160_BLOCK_LENGTH) {
-			RMD160Transform ((uint32 *) ctx->state, (const uint32 *) input);
-			input += RIPEMD160_BLOCK_LENGTH;
-			len -= RIPEMD160_BLOCK_LENGTH;
-		}
-	}
-
-	/* Handle any remaining bytes of data. */
-	if (len != 0)
-		memcpy (ctx->buffer + have, input, (size_t) len);
-}
-
-/*
-* Pad pad to 64-byte boundary with the bit pattern
-* 1 0* (64-bit count of bits processed, MSB-first)
-*/
-static void RMD160Pad(RMD160_CTX *ctx)
-{
-	byte count[8];
-	uint32 padlen;
-
-	/* Convert count to 8 bytes in little endian order. */
-
-#ifndef TC_WINDOWS_BOOT
-	uint64 bitcount = ctx->count << 3;
-	PUT_64BIT_LE(count, bitcount);
-#else
-	*(uint32 *) (count + 4) = 0;
-	*(uint32 *) (count + 0) = ctx->count << 3;
-#endif
-
-	/* Pad out to 56 mod 64. */
-	padlen = RIPEMD160_BLOCK_LENGTH -
-		(uint32)((ctx->count) & (RIPEMD160_BLOCK_LENGTH - 1));
-	if (padlen < 1 + 8)
-		padlen += RIPEMD160_BLOCK_LENGTH;
-	RMD160Update(ctx, PADDING, padlen - 8);            /* padlen - 8 <= 64 */
-	RMD160Update(ctx, count, 8);
-}
-
-/*
-* Final wrapup--call RMD160Pad, fill in digest and zero out ctx.
-*/
-void RMD160Final(unsigned char *digest, RMD160_CTX *ctx)
-{
-	int i;
-
-	RMD160Pad(ctx);
-	if (digest) {
-		for (i = 0; i < 5; i++)
-			PUT_32BIT_LE(digest + i * 4, ctx->state[i]);
-#ifndef TC_WINDOWS_BOOT
-		burn (ctx, sizeof(*ctx));
-#endif
-	}
-}
-
-
-#ifndef TC_MINIMIZE_CODE_SIZE
-
-#define word32 unsigned __int32
-
-#define k0 0
-#define k1 0x5a827999UL
-#define k2 0x6ed9eba1UL
-#define k3 0x8f1bbcdcUL
-#define k4 0xa953fd4eUL
-#define k5 0x50a28be6UL
-#define k6 0x5c4dd124UL
-#define k7 0x6d703ef3UL
-#define k8 0x7a6d76e9UL
-#define k9 0
-
-static word32 rotlFixed (word32 x, unsigned int y)
-{
-	return (word32)((x<<y) | (x>>(sizeof(word32)*8-y)));
-}
-
-#define Subround(f, a, b, c, d, e, x, s, k)        \
-	a += f(b, c, d) + x + k;\
-	a = rotlFixed((word32)a, s) + e;\
-	c = rotlFixed((word32)c, 10U)
-
-void RMD160Transform (unsigned __int32 *digest, const unsigned __int32 *data)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-	const word32 *X = data;
-#else
-	word32 X[16];
-	int i;
-#endif
-
-	word32 a1, b1, c1, d1, e1, a2, b2, c2, d2, e2;
-	a1 = a2 = digest[0];
-	b1 = b2 = digest[1];
-	c1 = c2 = digest[2];
-	d1 = d2 = digest[3];
-	e1 = e2 = digest[4];
-
-#if BYTE_ORDER == BIG_ENDIAN
-	for (i = 0; i < 16; i++)
-	{
-		X[i] = LE32 (data[i]);
-	}
-#endif
-
-	Subround(F, a1, b1, c1, d1, e1, X[ 0], 11, k0);
-	Subround(F, e1, a1, b1, c1, d1, X[ 1], 14, k0);
-	Subround(F, d1, e1, a1, b1, c1, X[ 2], 15, k0);
-	Subround(F, c1, d1, e1, a1, b1, X[ 3], 12, k0);
-	Subround(F, b1, c1, d1, e1, a1, X[ 4],  5, k0);
-	Subround(F, a1, b1, c1, d1, e1, X[ 5],  8, k0);
-	Subround(F, e1, a1, b1, c1, d1, X[ 6],  7, k0);
-	Subround(F, d1, e1, a1, b1, c1, X[ 7],  9, k0);
-	Subround(F, c1, d1, e1, a1, b1, X[ 8], 11, k0);
-	Subround(F, b1, c1, d1, e1, a1, X[ 9], 13, k0);
-	Subround(F, a1, b1, c1, d1, e1, X[10], 14, k0);
-	Subround(F, e1, a1, b1, c1, d1, X[11], 15, k0);
-	Subround(F, d1, e1, a1, b1, c1, X[12],  6, k0);
-	Subround(F, c1, d1, e1, a1, b1, X[13],  7, k0);
-	Subround(F, b1, c1, d1, e1, a1, X[14],  9, k0);
-	Subround(F, a1, b1, c1, d1, e1, X[15],  8, k0);
-
-	Subround(G, e1, a1, b1, c1, d1, X[ 7],  7, k1);
-	Subround(G, d1, e1, a1, b1, c1, X[ 4],  6, k1);
-	Subround(G, c1, d1, e1, a1, b1, X[13],  8, k1);
-	Subround(G, b1, c1, d1, e1, a1, X[ 1], 13, k1);
-	Subround(G, a1, b1, c1, d1, e1, X[10], 11, k1);
-	Subround(G, e1, a1, b1, c1, d1, X[ 6],  9, k1);
-	Subround(G, d1, e1, a1, b1, c1, X[15],  7, k1);
-	Subround(G, c1, d1, e1, a1, b1, X[ 3], 15, k1);
-	Subround(G, b1, c1, d1, e1, a1, X[12],  7, k1);
-	Subround(G, a1, b1, c1, d1, e1, X[ 0], 12, k1);
-	Subround(G, e1, a1, b1, c1, d1, X[ 9], 15, k1);
-	Subround(G, d1, e1, a1, b1, c1, X[ 5],  9, k1);
-	Subround(G, c1, d1, e1, a1, b1, X[ 2], 11, k1);
-	Subround(G, b1, c1, d1, e1, a1, X[14],  7, k1);
-	Subround(G, a1, b1, c1, d1, e1, X[11], 13, k1);
-	Subround(G, e1, a1, b1, c1, d1, X[ 8], 12, k1);
-
-	Subround(H, d1, e1, a1, b1, c1, X[ 3], 11, k2);
-	Subround(H, c1, d1, e1, a1, b1, X[10], 13, k2);
-	Subround(H, b1, c1, d1, e1, a1, X[14],  6, k2);
-	Subround(H, a1, b1, c1, d1, e1, X[ 4],  7, k2);
-	Subround(H, e1, a1, b1, c1, d1, X[ 9], 14, k2);
-	Subround(H, d1, e1, a1, b1, c1, X[15],  9, k2);
-	Subround(H, c1, d1, e1, a1, b1, X[ 8], 13, k2);
-	Subround(H, b1, c1, d1, e1, a1, X[ 1], 15, k2);
-	Subround(H, a1, b1, c1, d1, e1, X[ 2], 14, k2);
-	Subround(H, e1, a1, b1, c1, d1, X[ 7],  8, k2);
-	Subround(H, d1, e1, a1, b1, c1, X[ 0], 13, k2);
-	Subround(H, c1, d1, e1, a1, b1, X[ 6],  6, k2);
-	Subround(H, b1, c1, d1, e1, a1, X[13],  5, k2);
-	Subround(H, a1, b1, c1, d1, e1, X[11], 12, k2);
-	Subround(H, e1, a1, b1, c1, d1, X[ 5],  7, k2);
-	Subround(H, d1, e1, a1, b1, c1, X[12],  5, k2);
-
-	Subround(I, c1, d1, e1, a1, b1, X[ 1], 11, k3);
-	Subround(I, b1, c1, d1, e1, a1, X[ 9], 12, k3);
-	Subround(I, a1, b1, c1, d1, e1, X[11], 14, k3);
-	Subround(I, e1, a1, b1, c1, d1, X[10], 15, k3);
-	Subround(I, d1, e1, a1, b1, c1, X[ 0], 14, k3);
-	Subround(I, c1, d1, e1, a1, b1, X[ 8], 15, k3);
-	Subround(I, b1, c1, d1, e1, a1, X[12],  9, k3);
-	Subround(I, a1, b1, c1, d1, e1, X[ 4],  8, k3);
-	Subround(I, e1, a1, b1, c1, d1, X[13],  9, k3);
-	Subround(I, d1, e1, a1, b1, c1, X[ 3], 14, k3);
-	Subround(I, c1, d1, e1, a1, b1, X[ 7],  5, k3);
-	Subround(I, b1, c1, d1, e1, a1, X[15],  6, k3);
-	Subround(I, a1, b1, c1, d1, e1, X[14],  8, k3);
-	Subround(I, e1, a1, b1, c1, d1, X[ 5],  6, k3);
-	Subround(I, d1, e1, a1, b1, c1, X[ 6],  5, k3);
-	Subround(I, c1, d1, e1, a1, b1, X[ 2], 12, k3);
-
-	Subround(J, b1, c1, d1, e1, a1, X[ 4],  9, k4);
-	Subround(J, a1, b1, c1, d1, e1, X[ 0], 15, k4);
-	Subround(J, e1, a1, b1, c1, d1, X[ 5],  5, k4);
-	Subround(J, d1, e1, a1, b1, c1, X[ 9], 11, k4);
-	Subround(J, c1, d1, e1, a1, b1, X[ 7],  6, k4);
-	Subround(J, b1, c1, d1, e1, a1, X[12],  8, k4);
-	Subround(J, a1, b1, c1, d1, e1, X[ 2], 13, k4);
-	Subround(J, e1, a1, b1, c1, d1, X[10], 12, k4);
-	Subround(J, d1, e1, a1, b1, c1, X[14],  5, k4);
-	Subround(J, c1, d1, e1, a1, b1, X[ 1], 12, k4);
-	Subround(J, b1, c1, d1, e1, a1, X[ 3], 13, k4);
-	Subround(J, a1, b1, c1, d1, e1, X[ 8], 14, k4);
-	Subround(J, e1, a1, b1, c1, d1, X[11], 11, k4);
-	Subround(J, d1, e1, a1, b1, c1, X[ 6],  8, k4);
-	Subround(J, c1, d1, e1, a1, b1, X[15],  5, k4);
-	Subround(J, b1, c1, d1, e1, a1, X[13],  6, k4);
-
-	Subround(J, a2, b2, c2, d2, e2, X[ 5],  8, k5);
-	Subround(J, e2, a2, b2, c2, d2, X[14],  9, k5);
-	Subround(J, d2, e2, a2, b2, c2, X[ 7],  9, k5);
-	Subround(J, c2, d2, e2, a2, b2, X[ 0], 11, k5);
-	Subround(J, b2, c2, d2, e2, a2, X[ 9], 13, k5);
-	Subround(J, a2, b2, c2, d2, e2, X[ 2], 15, k5);
-	Subround(J, e2, a2, b2, c2, d2, X[11], 15, k5);
-	Subround(J, d2, e2, a2, b2, c2, X[ 4],  5, k5);
-	Subround(J, c2, d2, e2, a2, b2, X[13],  7, k5);
-	Subround(J, b2, c2, d2, e2, a2, X[ 6],  7, k5);
-	Subround(J, a2, b2, c2, d2, e2, X[15],  8, k5);
-	Subround(J, e2, a2, b2, c2, d2, X[ 8], 11, k5);
-	Subround(J, d2, e2, a2, b2, c2, X[ 1], 14, k5);
-	Subround(J, c2, d2, e2, a2, b2, X[10], 14, k5);
-	Subround(J, b2, c2, d2, e2, a2, X[ 3], 12, k5);
-	Subround(J, a2, b2, c2, d2, e2, X[12],  6, k5);
-
-	Subround(I, e2, a2, b2, c2, d2, X[ 6],  9, k6); 
-	Subround(I, d2, e2, a2, b2, c2, X[11], 13, k6);
-	Subround(I, c2, d2, e2, a2, b2, X[ 3], 15, k6);
-	Subround(I, b2, c2, d2, e2, a2, X[ 7],  7, k6);
-	Subround(I, a2, b2, c2, d2, e2, X[ 0], 12, k6);
-	Subround(I, e2, a2, b2, c2, d2, X[13],  8, k6);
-	Subround(I, d2, e2, a2, b2, c2, X[ 5],  9, k6);
-	Subround(I, c2, d2, e2, a2, b2, X[10], 11, k6);
-	Subround(I, b2, c2, d2, e2, a2, X[14],  7, k6);
-	Subround(I, a2, b2, c2, d2, e2, X[15],  7, k6);
-	Subround(I, e2, a2, b2, c2, d2, X[ 8], 12, k6);
-	Subround(I, d2, e2, a2, b2, c2, X[12],  7, k6);
-	Subround(I, c2, d2, e2, a2, b2, X[ 4],  6, k6);
-	Subround(I, b2, c2, d2, e2, a2, X[ 9], 15, k6);
-	Subround(I, a2, b2, c2, d2, e2, X[ 1], 13, k6);
-	Subround(I, e2, a2, b2, c2, d2, X[ 2], 11, k6);
-
-	Subround(H, d2, e2, a2, b2, c2, X[15],  9, k7);
-	Subround(H, c2, d2, e2, a2, b2, X[ 5],  7, k7);
-	Subround(H, b2, c2, d2, e2, a2, X[ 1], 15, k7);
-	Subround(H, a2, b2, c2, d2, e2, X[ 3], 11, k7);
-	Subround(H, e2, a2, b2, c2, d2, X[ 7],  8, k7);
-	Subround(H, d2, e2, a2, b2, c2, X[14],  6, k7);
-	Subround(H, c2, d2, e2, a2, b2, X[ 6],  6, k7);
-	Subround(H, b2, c2, d2, e2, a2, X[ 9], 14, k7);
-	Subround(H, a2, b2, c2, d2, e2, X[11], 12, k7);
-	Subround(H, e2, a2, b2, c2, d2, X[ 8], 13, k7);
-	Subround(H, d2, e2, a2, b2, c2, X[12],  5, k7);
-	Subround(H, c2, d2, e2, a2, b2, X[ 2], 14, k7);
-	Subround(H, b2, c2, d2, e2, a2, X[10], 13, k7);
-	Subround(H, a2, b2, c2, d2, e2, X[ 0], 13, k7);
-	Subround(H, e2, a2, b2, c2, d2, X[ 4],  7, k7);
-	Subround(H, d2, e2, a2, b2, c2, X[13],  5, k7);
-
-	Subround(G, c2, d2, e2, a2, b2, X[ 8], 15, k8);
-	Subround(G, b2, c2, d2, e2, a2, X[ 6],  5, k8);
-	Subround(G, a2, b2, c2, d2, e2, X[ 4],  8, k8);
-	Subround(G, e2, a2, b2, c2, d2, X[ 1], 11, k8);
-	Subround(G, d2, e2, a2, b2, c2, X[ 3], 14, k8);
-	Subround(G, c2, d2, e2, a2, b2, X[11], 14, k8);
-	Subround(G, b2, c2, d2, e2, a2, X[15],  6, k8);
-	Subround(G, a2, b2, c2, d2, e2, X[ 0], 14, k8);
-	Subround(G, e2, a2, b2, c2, d2, X[ 5],  6, k8);
-	Subround(G, d2, e2, a2, b2, c2, X[12],  9, k8);
-	Subround(G, c2, d2, e2, a2, b2, X[ 2], 12, k8);
-	Subround(G, b2, c2, d2, e2, a2, X[13],  9, k8);
-	Subround(G, a2, b2, c2, d2, e2, X[ 9], 12, k8);
-	Subround(G, e2, a2, b2, c2, d2, X[ 7],  5, k8);
-	Subround(G, d2, e2, a2, b2, c2, X[10], 15, k8);
-	Subround(G, c2, d2, e2, a2, b2, X[14],  8, k8);
-
-	Subround(F, b2, c2, d2, e2, a2, X[12],  8, k9);
-	Subround(F, a2, b2, c2, d2, e2, X[15],  5, k9);
-	Subround(F, e2, a2, b2, c2, d2, X[10], 12, k9);
-	Subround(F, d2, e2, a2, b2, c2, X[ 4],  9, k9);
-	Subround(F, c2, d2, e2, a2, b2, X[ 1], 12, k9);
-	Subround(F, b2, c2, d2, e2, a2, X[ 5],  5, k9);
-	Subround(F, a2, b2, c2, d2, e2, X[ 8], 14, k9);
-	Subround(F, e2, a2, b2, c2, d2, X[ 7],  6, k9);
-	Subround(F, d2, e2, a2, b2, c2, X[ 6],  8, k9);
-	Subround(F, c2, d2, e2, a2, b2, X[ 2], 13, k9);
-	Subround(F, b2, c2, d2, e2, a2, X[13],  6, k9);
-	Subround(F, a2, b2, c2, d2, e2, X[14],  5, k9);
-	Subround(F, e2, a2, b2, c2, d2, X[ 0], 15, k9);
-	Subround(F, d2, e2, a2, b2, c2, X[ 3], 13, k9);
-	Subround(F, c2, d2, e2, a2, b2, X[ 9], 11, k9);
-	Subround(F, b2, c2, d2, e2, a2, X[11], 11, k9);
-
-	c1        = digest[1] + c1 + d2;
-	digest[1] = digest[2] + d1 + e2;
-	digest[2] = digest[3] + e1 + a2;
-	digest[3] = digest[4] + a1 + b2;
-	digest[4] = digest[0] + b1 + c2;
-	digest[0] = c1;
-}
-
-#else // TC_MINIMIZE_CODE_SIZE
-
-/*
- Derived from source code of TrueCrypt 7.1a, which is
- Copyright (c) 2008-2012 TrueCrypt Developers Association and which is governed
- by the TrueCrypt License 3.0.
-
- Modifications and additions to the original source code (contained in this file) 
- and all other portions of this file are Copyright (c) 2013-2016 IDRIX
- and are governed by the Apache License 2.0 the full text of which is
- contained in the file License.txt included in VeraCrypt binary and source
- code distribution packages.
-*/
-
-#pragma optimize ("tl", on)
-
-typedef unsigned __int32 uint32;
-typedef unsigned __int8 byte;
-
-#include <stdlib.h>
-#pragma intrinsic (_lrotl)
-
-static const byte OrderTab[] = {
-	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-	7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8,
-	3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12,
-	1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2,
-	4, 0, 5, 9, 7, 12, 2, 10, 14, 1, 3, 8, 11, 6, 15, 13,
-	5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12,
-	6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2,
-	15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13,
-	8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14,
-	12, 15, 10, 4, 1, 5, 8, 7, 6, 2, 13, 14, 0, 3, 9, 11
-};
-
-static const byte RolTab[] = {
-	11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8,
-	7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12,
-	11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5,
-	11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12,
-	9, 15, 5, 11, 6, 8, 13, 12, 5, 12, 13, 14, 11, 8, 5, 6,
-	8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6,
-	9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11,
-	9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5,
-	15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8,
-	8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11
-};
-
-static const uint32 KTab[] = {
-	0x00000000UL,
-	0x5A827999UL,
-	0x6ED9EBA1UL,
-	0x8F1BBCDCUL,
-	0xA953FD4EUL,
-	0x50A28BE6UL,
-	0x5C4DD124UL,
-	0x6D703EF3UL,
-	0x7A6D76E9UL,
-	0x00000000UL
-};
-
-
-void RMD160Transform (unsigned __int32 *state, const unsigned __int32 *data)
-{
-	uint32 a, b, c, d, e;
-	uint32 a2, b2, c2, d2, e2;
-	byte pos;
-	uint32 tmp;
-
-	a = state[0];
-	b = state[1];
-	c = state[2];
-	d = state[3];
-	e = state[4];
-
-	for (pos = 0; pos < 160; ++pos)
-	{
-		tmp = a + data[OrderTab[pos]] + KTab[pos >> 4];
-		
-		switch (pos >> 4)
-		{
-		case 0: case 9: tmp += F (b, c, d); break;
-		case 1: case 8: tmp += G (b, c, d); break;
-		case 2: case 7: tmp += H (b, c, d); break;
-		case 3: case 6: tmp += I (b, c, d); break;
-		case 4: case 5: tmp += J (b, c, d); break;
-		}
-
-		tmp = _lrotl (tmp, RolTab[pos]) + e;
-		a = e;
-		e = d;
-		d = _lrotl (c, 10);
-		c = b;
-		b = tmp;
-
-		if (pos == 79)
-		{
-			a2 = a;
-			b2 = b;
-			c2 = c;
-			d2 = d;
-			e2 = e;
-
-			a = state[0];
-			b = state[1];
-			c = state[2];
-			d = state[3];
-			e = state[4];
-		}
-	}
-
-	tmp = state[1] + c2 + d;
-	state[1] = state[2] + d2 + e;
-	state[2] = state[3] + e2 + a;
-	state[3] = state[4] + a2 + b;
-	state[4] = state[0] + b2 + c;
-	state[0] = tmp;
-}
-
-#endif // TC_MINIMIZE_CODE_SIZE
+// RIPEMD-160 written and placed in the public domain by Wei Dai
+
+/*
+ * This code implements the MD4 message-digest algorithm.
+ * The algorithm is due to Ron Rivest.  This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ */
+
+/* Adapted for TrueCrypt */
+/* Adapted for VeraCrypt */
+
+#include <memory.h>
+#include "Common/Tcdefs.h"
+#include "Common/Endian.h"
+#include "Rmd160.h"
+
+#define F(x, y, z)    (x ^ y ^ z) 
+#define G(x, y, z)    (z ^ (x & (y^z)))
+#define H(x, y, z)    (z ^ (x | ~y))
+#define I(x, y, z)    (y ^ (z & (x^y)))
+#define J(x, y, z)    (x ^ (y | ~z))
+
+#define PUT_64BIT_LE(cp, value) do {                                    \
+	(cp)[7] = (byte) ((value) >> 56);                                        \
+	(cp)[6] = (byte) ((value) >> 48);                                        \
+	(cp)[5] = (byte) ((value) >> 40);                                        \
+	(cp)[4] = (byte) ((value) >> 32);                                        \
+	(cp)[3] = (byte) ((value) >> 24);                                        \
+	(cp)[2] = (byte) ((value) >> 16);                                        \
+	(cp)[1] = (byte) ((value) >> 8);                                         \
+	(cp)[0] = (byte) (value); } while (0)
+
+#define PUT_32BIT_LE(cp, value) do {                                    \
+	(cp)[3] = (byte) ((value) >> 24);                                        \
+	(cp)[2] = (byte) ((value) >> 16);                                        \
+	(cp)[1] = (byte) ((value) >> 8);                                         \
+	(cp)[0] = (byte) (value); } while (0)
+
+#ifndef TC_MINIMIZE_CODE_SIZE
+
+static byte PADDING[64] = {
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+#else
+
+static byte PADDING[64];
+
+#endif
+
+void RMD160Init (RMD160_CTX *ctx)
+{
+	ctx->count = 0;
+	ctx->state[0] = 0x67452301;
+	ctx->state[1] = 0xefcdab89;
+	ctx->state[2] = 0x98badcfe;
+	ctx->state[3] = 0x10325476;
+	ctx->state[4] = 0xc3d2e1f0;
+	PADDING[0] = 0x80;
+}
+
+/*
+* Update context to reflect the concatenation of another buffer full
+* of bytes.
+*/
+void RMD160Update (RMD160_CTX *ctx, const unsigned char *input, unsigned __int32 lenArg)
+{
+#ifndef TC_WINDOWS_BOOT
+	uint64 len = lenArg;
+#else
+	uint32 len = lenArg;
+#endif
+	unsigned int have, need;
+
+	/* Check how many bytes we already have and how many more we need. */
+	have = (unsigned int) ((ctx->count) & (RIPEMD160_BLOCK_LENGTH - 1));
+	need = RIPEMD160_BLOCK_LENGTH - have;
+
+	/* Update bitcount */
+	ctx->count += len;
+
+	if (len >= need) {
+		if (have != 0) {
+			memcpy (ctx->buffer + have, input, (size_t) need);
+			RMD160Transform ((uint32 *) ctx->state, (const uint32 *) ctx->buffer);
+			input += need;
+			len -= need;
+			have = 0;
+		}
+
+		/* Process data in RIPEMD160_BLOCK_LENGTH-byte chunks. */
+		while (len >= RIPEMD160_BLOCK_LENGTH) {
+			RMD160Transform ((uint32 *) ctx->state, (const uint32 *) input);
+			input += RIPEMD160_BLOCK_LENGTH;
+			len -= RIPEMD160_BLOCK_LENGTH;
+		}
+	}
+
+	/* Handle any remaining bytes of data. */
+	if (len != 0)
+		memcpy (ctx->buffer + have, input, (size_t) len);
+}
+
+/*
+* Pad pad to 64-byte boundary with the bit pattern
+* 1 0* (64-bit count of bits processed, MSB-first)
+*/
+static void RMD160Pad(RMD160_CTX *ctx)
+{
+	byte count[8];
+	uint32 padlen;
+
+	/* Convert count to 8 bytes in little endian order. */
+
+#ifndef TC_WINDOWS_BOOT
+	uint64 bitcount = ctx->count << 3;
+	PUT_64BIT_LE(count, bitcount);
+#else
+	*(uint32 *) (count + 4) = 0;
+	*(uint32 *) (count + 0) = ctx->count << 3;
+#endif
+
+	/* Pad out to 56 mod 64. */
+	padlen = RIPEMD160_BLOCK_LENGTH -
+		(uint32)((ctx->count) & (RIPEMD160_BLOCK_LENGTH - 1));
+	if (padlen < 1 + 8)
+		padlen += RIPEMD160_BLOCK_LENGTH;
+	RMD160Update(ctx, PADDING, padlen - 8);            /* padlen - 8 <= 64 */
+	RMD160Update(ctx, count, 8);
+}
+
+/*
+* Final wrapup--call RMD160Pad, fill in digest and zero out ctx.
+*/
+void RMD160Final(unsigned char *digest, RMD160_CTX *ctx)
+{
+	int i;
+
+	RMD160Pad(ctx);
+	if (digest) {
+		for (i = 0; i < 5; i++)
+			PUT_32BIT_LE(digest + i * 4, ctx->state[i]);
+#ifndef TC_WINDOWS_BOOT
+		burn (ctx, sizeof(*ctx));
+#endif
+	}
+}
+
+
+#ifndef TC_MINIMIZE_CODE_SIZE
+
+#define word32 unsigned __int32
+
+#define k0 0
+#define k1 0x5a827999UL
+#define k2 0x6ed9eba1UL
+#define k3 0x8f1bbcdcUL
+#define k4 0xa953fd4eUL
+#define k5 0x50a28be6UL
+#define k6 0x5c4dd124UL
+#define k7 0x6d703ef3UL
+#define k8 0x7a6d76e9UL
+#define k9 0
+
+static word32 rotlFixed (word32 x, unsigned int y)
+{
+	return (word32)((x<<y) | (x>>(sizeof(word32)*8-y)));
+}
+
+#define Subround(f, a, b, c, d, e, x, s, k)        \
+	a += f(b, c, d) + x + k;\
+	a = rotlFixed((word32)a, s) + e;\
+	c = rotlFixed((word32)c, 10U)
+
+void RMD160Transform (unsigned __int32 *digest, const unsigned __int32 *data)
+{
+#if BYTE_ORDER == LITTLE_ENDIAN
+	const word32 *X = data;
+#else
+	word32 X[16];
+	int i;
+#endif
+
+	word32 a1, b1, c1, d1, e1, a2, b2, c2, d2, e2;
+	a1 = a2 = digest[0];
+	b1 = b2 = digest[1];
+	c1 = c2 = digest[2];
+	d1 = d2 = digest[3];
+	e1 = e2 = digest[4];
+
+#if BYTE_ORDER == BIG_ENDIAN
+	for (i = 0; i < 16; i++)
+	{
+		X[i] = LE32 (data[i]);
+	}
+#endif
+
+	Subround(F, a1, b1, c1, d1, e1, X[ 0], 11, k0);
+	Subround(F, e1, a1, b1, c1, d1, X[ 1], 14, k0);
+	Subround(F, d1, e1, a1, b1, c1, X[ 2], 15, k0);
+	Subround(F, c1, d1, e1, a1, b1, X[ 3], 12, k0);
+	Subround(F, b1, c1, d1, e1, a1, X[ 4],  5, k0);
+	Subround(F, a1, b1, c1, d1, e1, X[ 5],  8, k0);
+	Subround(F, e1, a1, b1, c1, d1, X[ 6],  7, k0);
+	Subround(F, d1, e1, a1, b1, c1, X[ 7],  9, k0);
+	Subround(F, c1, d1, e1, a1, b1, X[ 8], 11, k0);
+	Subround(F, b1, c1, d1, e1, a1, X[ 9], 13, k0);
+	Subround(F, a1, b1, c1, d1, e1, X[10], 14, k0);
+	Subround(F, e1, a1, b1, c1, d1, X[11], 15, k0);
+	Subround(F, d1, e1, a1, b1, c1, X[12],  6, k0);
+	Subround(F, c1, d1, e1, a1, b1, X[13],  7, k0);
+	Subround(F, b1, c1, d1, e1, a1, X[14],  9, k0);
+	Subround(F, a1, b1, c1, d1, e1, X[15],  8, k0);
+
+	Subround(G, e1, a1, b1, c1, d1, X[ 7],  7, k1);
+	Subround(G, d1, e1, a1, b1, c1, X[ 4],  6, k1);
+	Subround(G, c1, d1, e1, a1, b1, X[13],  8, k1);
+	Subround(G, b1, c1, d1, e1, a1, X[ 1], 13, k1);
+	Subround(G, a1, b1, c1, d1, e1, X[10], 11, k1);
+	Subround(G, e1, a1, b1, c1, d1, X[ 6],  9, k1);
+	Subround(G, d1, e1, a1, b1, c1, X[15],  7, k1);
+	Subround(G, c1, d1, e1, a1, b1, X[ 3], 15, k1);
+	Subround(G, b1, c1, d1, e1, a1, X[12],  7, k1);
+	Subround(G, a1, b1, c1, d1, e1, X[ 0], 12, k1);
+	Subround(G, e1, a1, b1, c1, d1, X[ 9], 15, k1);
+	Subround(G, d1, e1, a1, b1, c1, X[ 5],  9, k1);
+	Subround(G, c1, d1, e1, a1, b1, X[ 2], 11, k1);
+	Subround(G, b1, c1, d1, e1, a1, X[14],  7, k1);
+	Subround(G, a1, b1, c1, d1, e1, X[11], 13, k1);
+	Subround(G, e1, a1, b1, c1, d1, X[ 8], 12, k1);
+
+	Subround(H, d1, e1, a1, b1, c1, X[ 3], 11, k2);
+	Subround(H, c1, d1, e1, a1, b1, X[10], 13, k2);
+	Subround(H, b1, c1, d1, e1, a1, X[14],  6, k2);
+	Subround(H, a1, b1, c1, d1, e1, X[ 4],  7, k2);
+	Subround(H, e1, a1, b1, c1, d1, X[ 9], 14, k2);
+	Subround(H, d1, e1, a1, b1, c1, X[15],  9, k2);
+	Subround(H, c1, d1, e1, a1, b1, X[ 8], 13, k2);
+	Subround(H, b1, c1, d1, e1, a1, X[ 1], 15, k2);
+	Subround(H, a1, b1, c1, d1, e1, X[ 2], 14, k2);
+	Subround(H, e1, a1, b1, c1, d1, X[ 7],  8, k2);
+	Subround(H, d1, e1, a1, b1, c1, X[ 0], 13, k2);
+	Subround(H, c1, d1, e1, a1, b1, X[ 6],  6, k2);
+	Subround(H, b1, c1, d1, e1, a1, X[13],  5, k2);
+	Subround(H, a1, b1, c1, d1, e1, X[11], 12, k2);
+	Subround(H, e1, a1, b1, c1, d1, X[ 5],  7, k2);
+	Subround(H, d1, e1, a1, b1, c1, X[12],  5, k2);
+
+	Subround(I, c1, d1, e1, a1, b1, X[ 1], 11, k3);
+	Subround(I, b1, c1, d1, e1, a1, X[ 9], 12, k3);
+	Subround(I, a1, b1, c1, d1, e1, X[11], 14, k3);
+	Subround(I, e1, a1, b1, c1, d1, X[10], 15, k3);
+	Subround(I, d1, e1, a1, b1, c1, X[ 0], 14, k3);
+	Subround(I, c1, d1, e1, a1, b1, X[ 8], 15, k3);
+	Subround(I, b1, c1, d1, e1, a1, X[12],  9, k3);
+	Subround(I, a1, b1, c1, d1, e1, X[ 4],  8, k3);
+	Subround(I, e1, a1, b1, c1, d1, X[13],  9, k3);
+	Subround(I, d1, e1, a1, b1, c1, X[ 3], 14, k3);
+	Subround(I, c1, d1, e1, a1, b1, X[ 7],  5, k3);
+	Subround(I, b1, c1, d1, e1, a1, X[15],  6, k3);
+	Subround(I, a1, b1, c1, d1, e1, X[14],  8, k3);
+	Subround(I, e1, a1, b1, c1, d1, X[ 5],  6, k3);
+	Subround(I, d1, e1, a1, b1, c1, X[ 6],  5, k3);
+	Subround(I, c1, d1, e1, a1, b1, X[ 2], 12, k3);
+
+	Subround(J, b1, c1, d1, e1, a1, X[ 4],  9, k4);
+	Subround(J, a1, b1, c1, d1, e1, X[ 0], 15, k4);
+	Subround(J, e1, a1, b1, c1, d1, X[ 5],  5, k4);
+	Subround(J, d1, e1, a1, b1, c1, X[ 9], 11, k4);
+	Subround(J, c1, d1, e1, a1, b1, X[ 7],  6, k4);
+	Subround(J, b1, c1, d1, e1, a1, X[12],  8, k4);
+	Subround(J, a1, b1, c1, d1, e1, X[ 2], 13, k4);
+	Subround(J, e1, a1, b1, c1, d1, X[10], 12, k4);
+	Subround(J, d1, e1, a1, b1, c1, X[14],  5, k4);
+	Subround(J, c1, d1, e1, a1, b1, X[ 1], 12, k4);
+	Subround(J, b1, c1, d1, e1, a1, X[ 3], 13, k4);
+	Subround(J, a1, b1, c1, d1, e1, X[ 8], 14, k4);
+	Subround(J, e1, a1, b1, c1, d1, X[11], 11, k4);
+	Subround(J, d1, e1, a1, b1, c1, X[ 6],  8, k4);
+	Subround(J, c1, d1, e1, a1, b1, X[15],  5, k4);
+	Subround(J, b1, c1, d1, e1, a1, X[13],  6, k4);
+
+	Subround(J, a2, b2, c2, d2, e2, X[ 5],  8, k5);
+	Subround(J, e2, a2, b2, c2, d2, X[14],  9, k5);
+	Subround(J, d2, e2, a2, b2, c2, X[ 7],  9, k5);
+	Subround(J, c2, d2, e2, a2, b2, X[ 0], 11, k5);
+	Subround(J, b2, c2, d2, e2, a2, X[ 9], 13, k5);
+	Subround(J, a2, b2, c2, d2, e2, X[ 2], 15, k5);
+	Subround(J, e2, a2, b2, c2, d2, X[11], 15, k5);
+	Subround(J, d2, e2, a2, b2, c2, X[ 4],  5, k5);
+	Subround(J, c2, d2, e2, a2, b2, X[13],  7, k5);
+	Subround(J, b2, c2, d2, e2, a2, X[ 6],  7, k5);
+	Subround(J, a2, b2, c2, d2, e2, X[15],  8, k5);
+	Subround(J, e2, a2, b2, c2, d2, X[ 8], 11, k5);
+	Subround(J, d2, e2, a2, b2, c2, X[ 1], 14, k5);
+	Subround(J, c2, d2, e2, a2, b2, X[10], 14, k5);
+	Subround(J, b2, c2, d2, e2, a2, X[ 3], 12, k5);
+	Subround(J, a2, b2, c2, d2, e2, X[12],  6, k5);
+
+	Subround(I, e2, a2, b2, c2, d2, X[ 6],  9, k6); 
+	Subround(I, d2, e2, a2, b2, c2, X[11], 13, k6);
+	Subround(I, c2, d2, e2, a2, b2, X[ 3], 15, k6);
+	Subround(I, b2, c2, d2, e2, a2, X[ 7],  7, k6);
+	Subround(I, a2, b2, c2, d2, e2, X[ 0], 12, k6);
+	Subround(I, e2, a2, b2, c2, d2, X[13],  8, k6);
+	Subround(I, d2, e2, a2, b2, c2, X[ 5],  9, k6);
+	Subround(I, c2, d2, e2, a2, b2, X[10], 11, k6);
+	Subround(I, b2, c2, d2, e2, a2, X[14],  7, k6);
+	Subround(I, a2, b2, c2, d2, e2, X[15],  7, k6);
+	Subround(I, e2, a2, b2, c2, d2, X[ 8], 12, k6);
+	Subround(I, d2, e2, a2, b2, c2, X[12],  7, k6);
+	Subround(I, c2, d2, e2, a2, b2, X[ 4],  6, k6);
+	Subround(I, b2, c2, d2, e2, a2, X[ 9], 15, k6);
+	Subround(I, a2, b2, c2, d2, e2, X[ 1], 13, k6);
+	Subround(I, e2, a2, b2, c2, d2, X[ 2], 11, k6);
+
+	Subround(H, d2, e2, a2, b2, c2, X[15],  9, k7);
+	Subround(H, c2, d2, e2, a2, b2, X[ 5],  7, k7);
+	Subround(H, b2, c2, d2, e2, a2, X[ 1], 15, k7);
+	Subround(H, a2, b2, c2, d2, e2, X[ 3], 11, k7);
+	Subround(H, e2, a2, b2, c2, d2, X[ 7],  8, k7);
+	Subround(H, d2, e2, a2, b2, c2, X[14],  6, k7);
+	Subround(H, c2, d2, e2, a2, b2, X[ 6],  6, k7);
+	Subround(H, b2, c2, d2, e2, a2, X[ 9], 14, k7);
+	Subround(H, a2, b2, c2, d2, e2, X[11], 12, k7);
+	Subround(H, e2, a2, b2, c2, d2, X[ 8], 13, k7);
+	Subround(H, d2, e2, a2, b2, c2, X[12],  5, k7);
+	Subround(H, c2, d2, e2, a2, b2, X[ 2], 14, k7);
+	Subround(H, b2, c2, d2, e2, a2, X[10], 13, k7);
+	Subround(H, a2, b2, c2, d2, e2, X[ 0], 13, k7);
+	Subround(H, e2, a2, b2, c2, d2, X[ 4],  7, k7);
+	Subround(H, d2, e2, a2, b2, c2, X[13],  5, k7);
+
+	Subround(G, c2, d2, e2, a2, b2, X[ 8], 15, k8);
+	Subround(G, b2, c2, d2, e2, a2, X[ 6],  5, k8);
+	Subround(G, a2, b2, c2, d2, e2, X[ 4],  8, k8);
+	Subround(G, e2, a2, b2, c2, d2, X[ 1], 11, k8);
+	Subround(G, d2, e2, a2, b2, c2, X[ 3], 14, k8);
+	Subround(G, c2, d2, e2, a2, b2, X[11], 14, k8);
+	Subround(G, b2, c2, d2, e2, a2, X[15],  6, k8);
+	Subround(G, a2, b2, c2, d2, e2, X[ 0], 14, k8);
+	Subround(G, e2, a2, b2, c2, d2, X[ 5],  6, k8);
+	Subround(G, d2, e2, a2, b2, c2, X[12],  9, k8);
+	Subround(G, c2, d2, e2, a2, b2, X[ 2], 12, k8);
+	Subround(G, b2, c2, d2, e2, a2, X[13],  9, k8);
+	Subround(G, a2, b2, c2, d2, e2, X[ 9], 12, k8);
+	Subround(G, e2, a2, b2, c2, d2, X[ 7],  5, k8);
+	Subround(G, d2, e2, a2, b2, c2, X[10], 15, k8);
+	Subround(G, c2, d2, e2, a2, b2, X[14],  8, k8);
+
+	Subround(F, b2, c2, d2, e2, a2, X[12],  8, k9);
+	Subround(F, a2, b2, c2, d2, e2, X[15],  5, k9);
+	Subround(F, e2, a2, b2, c2, d2, X[10], 12, k9);
+	Subround(F, d2, e2, a2, b2, c2, X[ 4],  9, k9);
+	Subround(F, c2, d2, e2, a2, b2, X[ 1], 12, k9);
+	Subround(F, b2, c2, d2, e2, a2, X[ 5],  5, k9);
+	Subround(F, a2, b2, c2, d2, e2, X[ 8], 14, k9);
+	Subround(F, e2, a2, b2, c2, d2, X[ 7],  6, k9);
+	Subround(F, d2, e2, a2, b2, c2, X[ 6],  8, k9);
+	Subround(F, c2, d2, e2, a2, b2, X[ 2], 13, k9);
+	Subround(F, b2, c2, d2, e2, a2, X[13],  6, k9);
+	Subround(F, a2, b2, c2, d2, e2, X[14],  5, k9);
+	Subround(F, e2, a2, b2, c2, d2, X[ 0], 15, k9);
+	Subround(F, d2, e2, a2, b2, c2, X[ 3], 13, k9);
+	Subround(F, c2, d2, e2, a2, b2, X[ 9], 11, k9);
+	Subround(F, b2, c2, d2, e2, a2, X[11], 11, k9);
+
+	c1        = digest[1] + c1 + d2;
+	digest[1] = digest[2] + d1 + e2;
+	digest[2] = digest[3] + e1 + a2;
+	digest[3] = digest[4] + a1 + b2;
+	digest[4] = digest[0] + b1 + c2;
+	digest[0] = c1;
+}
+
+#else // TC_MINIMIZE_CODE_SIZE
+
+/*
+ Derived from source code of TrueCrypt 7.1a, which is
+ Copyright (c) 2008-2012 TrueCrypt Developers Association and which is governed
+ by the TrueCrypt License 3.0.
+
+ Modifications and additions to the original source code (contained in this file) 
+ and all other portions of this file are Copyright (c) 2013-2016 IDRIX
+ and are governed by the Apache License 2.0 the full text of which is
+ contained in the file License.txt included in VeraCrypt binary and source
+ code distribution packages.
+*/
+
+#pragma optimize ("tl", on)
+
+typedef unsigned __int32 uint32; // 32-bit unsigned word; used for KTab entries and all arithmetic in RMD160Transform below
+typedef unsigned __int8 byte; // 8-bit unsigned; used for the OrderTab/RolTab entries and the step counter in RMD160Transform
+
+#include <stdlib.h>
+#pragma intrinsic (_lrotl)
+
+static const byte OrderTab[] = { // index of the data[] word consumed at each of the 160 steps of RMD160Transform (read as OrderTab[pos])
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // steps 0-15
+	7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8, // steps 16-31
+	3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12, // steps 32-47
+	1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2, // steps 48-63
+	4, 0, 5, 9, 7, 12, 2, 10, 14, 1, 3, 8, 11, 6, 15, 13, // steps 64-79 (last group before the working words are restarted)
+	5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12, // steps 80-95
+	6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2, // steps 96-111
+	15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13, // steps 112-127
+	8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14, // steps 128-143
+	12, 15, 10, 4, 1, 5, 8, 7, 6, 2, 13, 14, 0, 3, 9, 11 // steps 144-159
+};
+
+static const byte RolTab[] = { // left-rotate amount for each of the 160 steps of RMD160Transform (passed to _lrotl as RolTab[pos])
+	11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8, // steps 0-15
+	7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12, // steps 16-31
+	11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5, // steps 32-47
+	11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12, // steps 48-63
+	9, 15, 5, 11, 6, 8, 13, 12, 5, 12, 13, 14, 11, 8, 5, 6, // steps 64-79
+	8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6, // steps 80-95
+	9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11, // steps 96-111
+	9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5, // steps 112-127
+	15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8, // steps 128-143
+	8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11 // steps 144-159
+};
+
+static const uint32 KTab[] = { // additive constant per 16-step group; RMD160Transform reads it as KTab[pos >> 4]
+	0x00000000UL, // group 0 (steps 0-15, function F)
+	0x5A827999UL, // group 1 (steps 16-31, function G)
+	0x6ED9EBA1UL, // group 2 (steps 32-47, function H)
+	0x8F1BBCDCUL, // group 3 (steps 48-63, function I)
+	0xA953FD4EUL, // group 4 (steps 64-79, function J)
+	0x50A28BE6UL, // group 5 (steps 80-95, function J)
+	0x5C4DD124UL, // group 6 (steps 96-111, function I)
+	0x6D703EF3UL, // group 7 (steps 112-127, function H)
+	0x7A6D76E9UL, // group 8 (steps 128-143, function G)
+	0x00000000UL // group 9 (steps 144-159, function F)
+};
+
+
+void RMD160Transform (unsigned __int32 *state, const unsigned __int32 *data) // table-driven variant: updates state[0..4] in place from data[0..15]; 160 steps run in one loop driven by OrderTab/RolTab/KTab
+{
+	uint32 a, b, c, d, e; // working words of the 80-step half currently being computed
+	uint32 a2, b2, c2, d2, e2; // copy of a..e taken after step 79 (result of the first half); always assigned before the final combine
+	byte pos; // step counter 0..159; pos >> 4 is the 16-step group index 0..9
+	uint32 tmp;
+
+	a = state[0];
+	b = state[1];
+	c = state[2];
+	d = state[3];
+	e = state[4];
+
+	for (pos = 0; pos < 160; ++pos)
+	{
+		tmp = a + data[OrderTab[pos]] + KTab[pos >> 4]; // message word chosen by the step table plus the group constant
+		
+		switch (pos >> 4) // add the group's boolean function of (b, c, d); F..J are defined outside this block
+		{
+		case 0: case 9: tmp += F (b, c, d); break;
+		case 1: case 8: tmp += G (b, c, d); break;
+		case 2: case 7: tmp += H (b, c, d); break;
+		case 3: case 6: tmp += I (b, c, d); break;
+		case 4: case 5: tmp += J (b, c, d); break;
+		} // pos < 160 keeps pos >> 4 within 0..9, and every one of those values has a case above
+
+		tmp = _lrotl (tmp, RolTab[pos]) + e; // rotate left by the per-step amount, then add e
+		a = e; // shift the five working words along for the next step
+		e = d;
+		d = _lrotl (c, 10); // c is rotated left by 10 as it moves into d
+		c = b;
+		b = tmp;
+
+		if (pos == 79) // first 80 steps finished: keep their result and restart from the unmodified state for steps 80..159
+		{
+			a2 = a;
+			b2 = b;
+			c2 = c;
+			d2 = d;
+			e2 = e;
+
+			a = state[0];
+			b = state[1];
+			c = state[2];
+			d = state[3];
+			e = state[4];
+		}
+	}
+
+	tmp = state[1] + c2 + d; // final combine: each new word = an old state word + one word from each half; held in tmp because state[0] is still read below
+	state[1] = state[2] + d2 + e;
+	state[2] = state[3] + e2 + a;
+	state[3] = state[4] + a2 + b;
+	state[4] = state[0] + b2 + c;
+	state[0] = tmp; // stored last so that the read of the old state[0] on the previous line is not clobbered
+}
+
+#endif // TC_MINIMIZE_CODE_SIZE
diff --git a/src/Crypto/Rmd160.h b/src/Crypto/Rmd160.h
index 4dfa38f1..81b5d6f0 100644
--- a/src/Crypto/Rmd160.h
+++ b/src/Crypto/Rmd160.h
@@ -1,33 +1,33 @@
-#ifndef TC_HEADER_Crypto_Ripemd160
-#define TC_HEADER_Crypto_Ripemd160
-
-#include "Common/Tcdefs.h"
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-#define RIPEMD160_BLOCK_LENGTH 64
-
-typedef struct RMD160Context
-{
-	unsigned __int32 state[5];
-#ifndef TC_WINDOWS_BOOT
-	uint64 count;
-#else
-	uint32 count;
-#endif
-	unsigned char buffer[RIPEMD160_BLOCK_LENGTH];
-} RMD160_CTX;
-
-void RMD160Init (RMD160_CTX *ctx);
-void RMD160Transform (unsigned __int32 *state, const unsigned __int32 *data);
-void RMD160Update (RMD160_CTX *ctx, const unsigned char *input, unsigned __int32 len);
-void RMD160Final (unsigned char *digest, RMD160_CTX *ctx);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif // TC_HEADER_Crypto_Ripemd160
+#ifndef TC_HEADER_Crypto_Ripemd160
+#define TC_HEADER_Crypto_Ripemd160
+
+#include "Common/Tcdefs.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#define RIPEMD160_BLOCK_LENGTH 64
+
+typedef struct RMD160Context // context object taken by RMD160Init / RMD160Update / RMD160Final declared below
+{
+	unsigned __int32 state[5]; // five 32-bit words; presumably the array handed to RMD160Transform -- confirm in Rmd160.c
+#ifndef TC_WINDOWS_BOOT
+	uint64 count; // NOTE(review): presumably the amount of input processed so far; the unit is not visible in this header -- confirm
+#else
+	uint32 count; // same member, but only 32 bits wide when TC_WINDOWS_BOOT is defined
+#endif
+	unsigned char buffer[RIPEMD160_BLOCK_LENGTH]; // RIPEMD160_BLOCK_LENGTH (64) bytes of storage; presumably input waiting for a full block -- confirm
+} RMD160_CTX;
+
+void RMD160Init (RMD160_CTX *ctx); // NOTE(review): presumably prepares ctx for a new hash computation -- implementation not visible from this header
+void RMD160Transform (unsigned __int32 *state, const unsigned __int32 *data); // updates the five words state[0..4] in place from the sixteen 32-bit words data[0..15]
+void RMD160Update (RMD160_CTX *ctx, const unsigned char *input, unsigned __int32 len); // NOTE(review): presumably feeds len bytes from input into ctx -- confirm in Rmd160.c
+void RMD160Final (unsigned char *digest, RMD160_CTX *ctx); // NOTE(review): presumably writes the finished hash to digest; required buffer size is not stated here -- confirm
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // TC_HEADER_Crypto_Ripemd160
diff --git a/src/Crypto/Serpent.c b/src/Crypto/Serpent.c
index 87d710c4..a8c528de 100644
--- a/src/Crypto/Serpent.c
+++ b/src/Crypto/Serpent.c
@@ -1,938 +1,938 @@
-// serpent.cpp - written and placed in the public domain by Wei Dai
-
-/* Adapted for TrueCrypt */
-/* Adapted for VeraCrypt */
-
-#ifdef TC_WINDOWS_BOOT
-#pragma optimize ("t", on)
-#endif
-
-#include "Serpent.h"
-#include "Common/Endian.h"
-
-#include <memory.h>
-
-#if defined(_WIN32) && !defined(_DEBUG)
-#include <stdlib.h>
-#define rotlFixed _rotl
-#define rotrFixed _rotr
-#else
-#define rotlFixed(x,n)   (((x) << (n)) | ((x) >> (32 - (n))))
-#define rotrFixed(x,n)   (((x) >> (n)) | ((x) << (32 - (n))))
-#endif
-
-// linear transformation
-#define LT(i,a,b,c,d,e)	{\
-	a = rotlFixed(a, 13);	\
-	c = rotlFixed(c, 3); 	\
-	d = rotlFixed(d ^ c ^ (a << 3), 7); 	\
-	b = rotlFixed(b ^ a ^ c, 1); 	\
-	a = rotlFixed(a ^ b ^ d, 5); 		\
-	c = rotlFixed(c ^ d ^ (b << 7), 22);}
-
-// inverse linear transformation
-#define ILT(i,a,b,c,d,e)	{\
-	c = rotrFixed(c, 22);	\
-	a = rotrFixed(a, 5); 	\
-	c ^= d ^ (b << 7);	\
-	a ^= b ^ d; 		\
-	b = rotrFixed(b, 1); 	\
-	d = rotrFixed(d, 7) ^ c ^ (a << 3);	\
-	b ^= a ^ c; 		\
-	c = rotrFixed(c, 3); 	\
-	a = rotrFixed(a, 13);}
-
-// order of output from S-box functions
-#define beforeS0(f) f(0,a,b,c,d,e)
-#define afterS0(f) f(1,b,e,c,a,d)
-#define afterS1(f) f(2,c,b,a,e,d)
-#define afterS2(f) f(3,a,e,b,d,c)
-#define afterS3(f) f(4,e,b,d,c,a)
-#define afterS4(f) f(5,b,a,e,c,d)
-#define afterS5(f) f(6,a,c,b,e,d)
-#define afterS6(f) f(7,a,c,d,b,e)
-#define afterS7(f) f(8,d,e,b,a,c)
-
-// order of output from inverse S-box functions
-#define beforeI7(f) f(8,a,b,c,d,e)
-#define afterI7(f) f(7,d,a,b,e,c)
-#define afterI6(f) f(6,a,b,c,e,d)
-#define afterI5(f) f(5,b,d,e,c,a)
-#define afterI4(f) f(4,b,c,e,a,d)
-#define afterI3(f) f(3,a,b,e,c,d)
-#define afterI2(f) f(2,b,d,e,c,a)
-#define afterI1(f) f(1,a,b,c,e,d)
-#define afterI0(f) f(0,a,d,b,e,c)
-
-// The instruction sequences for the S-box functions 
-// come from Dag Arne Osvik's paper "Speeding up Serpent".
-
-#define S0(i, r0, r1, r2, r3, r4) \
-       {           \
-    r3 ^= r0;   \
-    r4 = r1;   \
-    r1 &= r3;   \
-    r4 ^= r2;   \
-    r1 ^= r0;   \
-    r0 |= r3;   \
-    r0 ^= r4;   \
-    r4 ^= r3;   \
-    r3 ^= r2;   \
-    r2 |= r1;   \
-    r2 ^= r4;   \
-    r4 = ~r4;      \
-    r4 |= r1;   \
-    r1 ^= r3;   \
-    r1 ^= r4;   \
-    r3 |= r0;   \
-    r1 ^= r3;   \
-    r4 ^= r3;   \
-            }
-
-#define I0(i, r0, r1, r2, r3, r4) \
-       {           \
-    r2 = ~r2;      \
-    r4 = r1;   \
-    r1 |= r0;   \
-    r4 = ~r4;      \
-    r1 ^= r2;   \
-    r2 |= r4;   \
-    r1 ^= r3;   \
-    r0 ^= r4;   \
-    r2 ^= r0;   \
-    r0 &= r3;   \
-    r4 ^= r0;   \
-    r0 |= r1;   \
-    r0 ^= r2;   \
-    r3 ^= r4;   \
-    r2 ^= r1;   \
-    r3 ^= r0;   \
-    r3 ^= r1;   \
-    r2 &= r3;   \
-    r4 ^= r2;   \
-            }
-
-#define S1(i, r0, r1, r2, r3, r4) \
-       {           \
-    r0 = ~r0;      \
-    r2 = ~r2;      \
-    r4 = r0;   \
-    r0 &= r1;   \
-    r2 ^= r0;   \
-    r0 |= r3;   \
-    r3 ^= r2;   \
-    r1 ^= r0;   \
-    r0 ^= r4;   \
-    r4 |= r1;   \
-    r1 ^= r3;   \
-    r2 |= r0;   \
-    r2 &= r4;   \
-    r0 ^= r1;   \
-    r1 &= r2;   \
-    r1 ^= r0;   \
-    r0 &= r2;   \
-    r0 ^= r4;   \
-            }
-
-#define I1(i, r0, r1, r2, r3, r4) \
-       {           \
-    r4 = r1;   \
-    r1 ^= r3;   \
-    r3 &= r1;   \
-    r4 ^= r2;   \
-    r3 ^= r0;   \
-    r0 |= r1;   \
-    r2 ^= r3;   \
-    r0 ^= r4;   \
-    r0 |= r2;   \
-    r1 ^= r3;   \
-    r0 ^= r1;   \
-    r1 |= r3;   \
-    r1 ^= r0;   \
-    r4 = ~r4;      \
-    r4 ^= r1;   \
-    r1 |= r0;   \
-    r1 ^= r0;   \
-    r1 |= r4;   \
-    r3 ^= r1;   \
-            }
-
-#define S2(i, r0, r1, r2, r3, r4) \
-       {           \
-    r4 = r0;   \
-    r0 &= r2;   \
-    r0 ^= r3;   \
-    r2 ^= r1;   \
-    r2 ^= r0;   \
-    r3 |= r4;   \
-    r3 ^= r1;   \
-    r4 ^= r2;   \
-    r1 = r3;   \
-    r3 |= r4;   \
-    r3 ^= r0;   \
-    r0 &= r1;   \
-    r4 ^= r0;   \
-    r1 ^= r3;   \
-    r1 ^= r4;   \
-    r4 = ~r4;      \
-            }
-
-#define I2(i, r0, r1, r2, r3, r4) \
-       {           \
-    r2 ^= r3;   \
-    r3 ^= r0;   \
-    r4 = r3;   \
-    r3 &= r2;   \
-    r3 ^= r1;   \
-    r1 |= r2;   \
-    r1 ^= r4;   \
-    r4 &= r3;   \
-    r2 ^= r3;   \
-    r4 &= r0;   \
-    r4 ^= r2;   \
-    r2 &= r1;   \
-    r2 |= r0;   \
-    r3 = ~r3;      \
-    r2 ^= r3;   \
-    r0 ^= r3;   \
-    r0 &= r1;   \
-    r3 ^= r4;   \
-    r3 ^= r0;   \
-            }
-
-#define S3(i, r0, r1, r2, r3, r4) \
-       {           \
-    r4 = r0;   \
-    r0 |= r3;   \
-    r3 ^= r1;   \
-    r1 &= r4;   \
-    r4 ^= r2;   \
-    r2 ^= r3;   \
-    r3 &= r0;   \
-    r4 |= r1;   \
-    r3 ^= r4;   \
-    r0 ^= r1;   \
-    r4 &= r0;   \
-    r1 ^= r3;   \
-    r4 ^= r2;   \
-    r1 |= r0;   \
-    r1 ^= r2;   \
-    r0 ^= r3;   \
-    r2 = r1;   \
-    r1 |= r3;   \
-    r1 ^= r0;   \
-            }
-
-#define I3(i, r0, r1, r2, r3, r4) \
-       {           \
-    r4 = r2;   \
-    r2 ^= r1;   \
-    r1 &= r2;   \
-    r1 ^= r0;   \
-    r0 &= r4;   \
-    r4 ^= r3;   \
-    r3 |= r1;   \
-    r3 ^= r2;   \
-    r0 ^= r4;   \
-    r2 ^= r0;   \
-    r0 |= r3;   \
-    r0 ^= r1;   \
-    r4 ^= r2;   \
-    r2 &= r3;   \
-    r1 |= r3;   \
-    r1 ^= r2;   \
-    r4 ^= r0;   \
-    r2 ^= r4;   \
-            }
-
-#define S4(i, r0, r1, r2, r3, r4) \
-       {           \
-    r1 ^= r3;   \
-    r3 = ~r3;      \
-    r2 ^= r3;   \
-    r3 ^= r0;   \
-    r4 = r1;   \
-    r1 &= r3;   \
-    r1 ^= r2;   \
-    r4 ^= r3;   \
-    r0 ^= r4;   \
-    r2 &= r4;   \
-    r2 ^= r0;   \
-    r0 &= r1;   \
-    r3 ^= r0;   \
-    r4 |= r1;   \
-    r4 ^= r0;   \
-    r0 |= r3;   \
-    r0 ^= r2;   \
-    r2 &= r3;   \
-    r0 = ~r0;      \
-    r4 ^= r2;   \
-            }
-
-#define I4(i, r0, r1, r2, r3, r4) \
-       {           \
-    r4 = r2;   \
-    r2 &= r3;   \
-    r2 ^= r1;   \
-    r1 |= r3;   \
-    r1 &= r0;   \
-    r4 ^= r2;   \
-    r4 ^= r1;   \
-    r1 &= r2;   \
-    r0 = ~r0;      \
-    r3 ^= r4;   \
-    r1 ^= r3;   \
-    r3 &= r0;   \
-    r3 ^= r2;   \
-    r0 ^= r1;   \
-    r2 &= r0;   \
-    r3 ^= r0;   \
-    r2 ^= r4;   \
-    r2 |= r3;   \
-    r3 ^= r0;   \
-    r2 ^= r1;   \
-            }
-
-#define S5(i, r0, r1, r2, r3, r4) \
-       {           \
-    r0 ^= r1;   \
-    r1 ^= r3;   \
-    r3 = ~r3;      \
-    r4 = r1;   \
-    r1 &= r0;   \
-    r2 ^= r3;   \
-    r1 ^= r2;   \
-    r2 |= r4;   \
-    r4 ^= r3;   \
-    r3 &= r1;   \
-    r3 ^= r0;   \
-    r4 ^= r1;   \
-    r4 ^= r2;   \
-    r2 ^= r0;   \
-    r0 &= r3;   \
-    r2 = ~r2;      \
-    r0 ^= r4;   \
-    r4 |= r3;   \
-    r2 ^= r4;   \
-            }
-
-#define I5(i, r0, r1, r2, r3, r4) \
-       {           \
-    r1 = ~r1;      \
-    r4 = r3;   \
-    r2 ^= r1;   \
-    r3 |= r0;   \
-    r3 ^= r2;   \
-    r2 |= r1;   \
-    r2 &= r0;   \
-    r4 ^= r3;   \
-    r2 ^= r4;   \
-    r4 |= r0;   \
-    r4 ^= r1;   \
-    r1 &= r2;   \
-    r1 ^= r3;   \
-    r4 ^= r2;   \
-    r3 &= r4;   \
-    r4 ^= r1;   \
-    r3 ^= r0;   \
-    r3 ^= r4;   \
-    r4 = ~r4;      \
-            }
-
-#define S6(i, r0, r1, r2, r3, r4) \
-       {           \
-    r2 = ~r2;      \
-    r4 = r3;   \
-    r3 &= r0;   \
-    r0 ^= r4;   \
-    r3 ^= r2;   \
-    r2 |= r4;   \
-    r1 ^= r3;   \
-    r2 ^= r0;   \
-    r0 |= r1;   \
-    r2 ^= r1;   \
-    r4 ^= r0;   \
-    r0 |= r3;   \
-    r0 ^= r2;   \
-    r4 ^= r3;   \
-    r4 ^= r0;   \
-    r3 = ~r3;      \
-    r2 &= r4;   \
-    r2 ^= r3;   \
-            }
-
-#define I6(i, r0, r1, r2, r3, r4) \
-       {           \
-    r0 ^= r2;   \
-    r4 = r2;   \
-    r2 &= r0;   \
-    r4 ^= r3;   \
-    r2 = ~r2;      \
-    r3 ^= r1;   \
-    r2 ^= r3;   \
-    r4 |= r0;   \
-    r0 ^= r2;   \
-    r3 ^= r4;   \
-    r4 ^= r1;   \
-    r1 &= r3;   \
-    r1 ^= r0;   \
-    r0 ^= r3;   \
-    r0 |= r2;   \
-    r3 ^= r1;   \
-    r4 ^= r0;   \
-            }
-
-#define S7(i, r0, r1, r2, r3, r4) \
-       {           \
-    r4 = r2;   \
-    r2 &= r1;   \
-    r2 ^= r3;   \
-    r3 &= r1;   \
-    r4 ^= r2;   \
-    r2 ^= r1;   \
-    r1 ^= r0;   \
-    r0 |= r4;   \
-    r0 ^= r2;   \
-    r3 ^= r1;   \
-    r2 ^= r3;   \
-    r3 &= r0;   \
-    r3 ^= r4;   \
-    r4 ^= r2;   \
-    r2 &= r0;   \
-    r4 = ~r4;      \
-    r2 ^= r4;   \
-    r4 &= r0;   \
-    r1 ^= r3;   \
-    r4 ^= r1;   \
-            }
-
-#define I7(i, r0, r1, r2, r3, r4) \
-       {           \
-    r4 = r2;   \
-    r2 ^= r0;   \
-    r0 &= r3;   \
-    r2 = ~r2;      \
-    r4 |= r3;   \
-    r3 ^= r1;   \
-    r1 |= r0;   \
-    r0 ^= r2;   \
-    r2 &= r4;   \
-    r1 ^= r2;   \
-    r2 ^= r0;   \
-    r0 |= r2;   \
-    r3 &= r4;   \
-    r0 ^= r3;   \
-    r4 ^= r1;   \
-    r3 ^= r4;   \
-    r4 |= r0;   \
-    r3 ^= r2;   \
-    r4 ^= r2;   \
-            }
-
-// key xor
-#define KX(r, a, b, c, d, e)	{\
-	a ^= k[4 * r + 0]; \
-	b ^= k[4 * r + 1]; \
-	c ^= k[4 * r + 2]; \
-	d ^= k[4 * r + 3];}
-
-
-#ifdef TC_MINIMIZE_CODE_SIZE
-
-static void S0f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
-{
-	*r3 ^= *r0;
-	*r4 = *r1;
-	*r1 &= *r3;
-	*r4 ^= *r2;
-	*r1 ^= *r0;
-	*r0 |= *r3;
-	*r0 ^= *r4;
-	*r4 ^= *r3;
-	*r3 ^= *r2;
-	*r2 |= *r1;
-	*r2 ^= *r4;
-	*r4 = ~*r4;
-	*r4 |= *r1;
-	*r1 ^= *r3;
-	*r1 ^= *r4;
-	*r3 |= *r0;
-	*r1 ^= *r3;
-	*r4 ^= *r3;
-}
-
-static void S1f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
-{        
-    *r0 = ~*r0;   
-    *r2 = ~*r2;   
-    *r4 = *r0;
-    *r0 &= *r1;
-    *r2 ^= *r0;
-    *r0 |= *r3;
-    *r3 ^= *r2;
-    *r1 ^= *r0;
-    *r0 ^= *r4;
-    *r4 |= *r1;
-    *r1 ^= *r3;
-    *r2 |= *r0;
-    *r2 &= *r4;
-    *r0 ^= *r1;
-    *r1 &= *r2;
-    *r1 ^= *r0;
-    *r0 &= *r2;
-    *r0 ^= *r4;
-}
-
-static void S2f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
-{        
-	*r4 = *r0;
-	*r0 &= *r2;
-	*r0 ^= *r3;
-	*r2 ^= *r1;
-	*r2 ^= *r0;
-	*r3 |= *r4;
-	*r3 ^= *r1;
-	*r4 ^= *r2;
-	*r1 = *r3;
-	*r3 |= *r4;
-	*r3 ^= *r0;
-	*r0 &= *r1;
-	*r4 ^= *r0;
-	*r1 ^= *r3;
-	*r1 ^= *r4;
-	*r4 = ~*r4;   
-}
-
-static void S3f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
-{        
-	*r4 = *r0;
-	*r0 |= *r3;
-	*r3 ^= *r1;
-	*r1 &= *r4;
-	*r4 ^= *r2;
-	*r2 ^= *r3;
-	*r3 &= *r0;
-	*r4 |= *r1;
-	*r3 ^= *r4;
-	*r0 ^= *r1;
-	*r4 &= *r0;
-	*r1 ^= *r3;
-	*r4 ^= *r2;
-	*r1 |= *r0;
-	*r1 ^= *r2;
-	*r0 ^= *r3;
-	*r2 = *r1;
-	*r1 |= *r3;
-	*r1 ^= *r0;
-}
-
-static void S4f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
-{        
-	*r1 ^= *r3;
-	*r3 = ~*r3;   
-	*r2 ^= *r3;
-	*r3 ^= *r0;
-	*r4 = *r1;
-	*r1 &= *r3;
-	*r1 ^= *r2;
-	*r4 ^= *r3;
-	*r0 ^= *r4;
-	*r2 &= *r4;
-	*r2 ^= *r0;
-	*r0 &= *r1;
-	*r3 ^= *r0;
-	*r4 |= *r1;
-	*r4 ^= *r0;
-	*r0 |= *r3;
-	*r0 ^= *r2;
-	*r2 &= *r3;
-	*r0 = ~*r0;   
-	*r4 ^= *r2;
-}
-
-static void S5f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
-{        
-	*r0 ^= *r1;
-	*r1 ^= *r3;
-	*r3 = ~*r3;   
-	*r4 = *r1;
-	*r1 &= *r0;
-	*r2 ^= *r3;
-	*r1 ^= *r2;
-	*r2 |= *r4;
-	*r4 ^= *r3;
-	*r3 &= *r1;
-	*r3 ^= *r0;
-	*r4 ^= *r1;
-	*r4 ^= *r2;
-	*r2 ^= *r0;
-	*r0 &= *r3;
-	*r2 = ~*r2;   
-	*r0 ^= *r4;
-	*r4 |= *r3;
-	*r2 ^= *r4;
-}
-
-static void S6f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
-{        
-	*r2 = ~*r2;   
-	*r4 = *r3;
-	*r3 &= *r0;
-	*r0 ^= *r4;
-	*r3 ^= *r2;
-	*r2 |= *r4;
-	*r1 ^= *r3;
-	*r2 ^= *r0;
-	*r0 |= *r1;
-	*r2 ^= *r1;
-	*r4 ^= *r0;
-	*r0 |= *r3;
-	*r0 ^= *r2;
-	*r4 ^= *r3;
-	*r4 ^= *r0;
-	*r3 = ~*r3;   
-	*r2 &= *r4;
-	*r2 ^= *r3;
-}
-
-static void S7f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
-{        
-	*r4 = *r2;
-	*r2 &= *r1;
-	*r2 ^= *r3;
-	*r3 &= *r1;
-	*r4 ^= *r2;
-	*r2 ^= *r1;
-	*r1 ^= *r0;
-	*r0 |= *r4;
-	*r0 ^= *r2;
-	*r3 ^= *r1;
-	*r2 ^= *r3;
-	*r3 &= *r0;
-	*r3 ^= *r4;
-	*r4 ^= *r2;
-	*r2 &= *r0;
-	*r4 = ~*r4;   
-	*r2 ^= *r4;
-	*r4 &= *r0;
-	*r1 ^= *r3;
-	*r4 ^= *r1;
-}
-
-static void KXf (const unsigned __int32 *k, unsigned int r, unsigned __int32 *a, unsigned __int32 *b, unsigned __int32 *c, unsigned __int32 *d)
-{
-	*a ^= k[r];
-	*b ^= k[r + 1];
-	*c ^= k[r + 2];
-	*d ^= k[r + 3];
-}
-
-#endif // TC_MINIMIZE_CODE_SIZE
-
-#ifndef TC_MINIMIZE_CODE_SIZE
-
-void serpent_set_key(const unsigned __int8 userKey[],unsigned __int8 *ks)
-{
-	unsigned __int32 a,b,c,d,e;
-	unsigned __int32 *k = (unsigned __int32 *)ks;
-	unsigned __int32 t;
-	int i;
-
-	for (i = 0; i < 8; i++)
-		k[i] = LE32(((unsigned __int32*)userKey)[i]);
-
-	k += 8;
-	t = k[-1];
-	for (i = 0; i < 132; ++i)
-		k[i] = t = rotlFixed(k[i-8] ^ k[i-5] ^ k[i-3] ^ t ^ 0x9e3779b9 ^ i, 11);
-	k -= 20;
-
-#define LK(r, a, b, c, d, e)	{\
-	a = k[(8-r)*4 + 0];		\
-	b = k[(8-r)*4 + 1];		\
-	c = k[(8-r)*4 + 2];		\
-	d = k[(8-r)*4 + 3];}
-
-#define SK(r, a, b, c, d, e)	{\
-	k[(8-r)*4 + 4] = a;		\
-	k[(8-r)*4 + 5] = b;		\
-	k[(8-r)*4 + 6] = c;		\
-	k[(8-r)*4 + 7] = d;}	\
-
-	for (i=0; i<4; i++)
-	{
-		afterS2(LK); afterS2(S3); afterS3(SK);
-		afterS1(LK); afterS1(S2); afterS2(SK);
-		afterS0(LK); afterS0(S1); afterS1(SK);
-		beforeS0(LK); beforeS0(S0); afterS0(SK);
-		k += 8*4;
-		afterS6(LK); afterS6(S7); afterS7(SK);
-		afterS5(LK); afterS5(S6); afterS6(SK);
-		afterS4(LK); afterS4(S5); afterS5(SK);
-		afterS3(LK); afterS3(S4); afterS4(SK);
-	}
-	afterS2(LK); afterS2(S3); afterS3(SK);
-}
-
-#else // TC_MINIMIZE_CODE_SIZE
-
-static void LKf (unsigned __int32 *k, unsigned int r, unsigned __int32 *a, unsigned __int32 *b, unsigned __int32 *c, unsigned __int32 *d)
-{
-	*a = k[r];
-	*b = k[r + 1];
-	*c = k[r + 2];
-	*d = k[r + 3];
-}
-
-static void SKf (unsigned __int32 *k, unsigned int r, unsigned __int32 *a, unsigned __int32 *b, unsigned __int32 *c, unsigned __int32 *d)
-{
-	k[r + 4] = *a;
-	k[r + 5] = *b;
-	k[r + 6] = *c;
-	k[r + 7] = *d;
-}
-
-void serpent_set_key(const unsigned __int8 userKey[], unsigned __int8 *ks)
-{
-	unsigned __int32 a,b,c,d,e;
-	unsigned __int32 *k = (unsigned __int32 *)ks;
-	unsigned __int32 t;
-	int i;
-
-	for (i = 0; i < 8; i++)
-		k[i] = LE32(((unsigned __int32*)userKey)[i]);
-
-	k += 8;
-	t = k[-1];
-	for (i = 0; i < 132; ++i)
-		k[i] = t = rotlFixed(k[i-8] ^ k[i-5] ^ k[i-3] ^ t ^ 0x9e3779b9 ^ i, 11);
-	k -= 20;
-
-	for (i=0; i<4; i++)
-	{
-		LKf (k, 20, &a, &e, &b, &d); S3f (&a, &e, &b, &d, &c); SKf (k, 16, &e, &b, &d, &c);
-		LKf (k, 24, &c, &b, &a, &e); S2f (&c, &b, &a, &e, &d); SKf (k, 20, &a, &e, &b, &d);
-		LKf (k, 28, &b, &e, &c, &a); S1f (&b, &e, &c, &a, &d); SKf (k, 24, &c, &b, &a, &e);
-		LKf (k, 32, &a, &b, &c, &d); S0f (&a, &b, &c, &d, &e); SKf (k, 28, &b, &e, &c, &a);
-		k += 8*4;
-		LKf (k,  4, &a, &c, &d, &b); S7f (&a, &c, &d, &b, &e); SKf (k,  0, &d, &e, &b, &a);
-		LKf (k,  8, &a, &c, &b, &e); S6f (&a, &c, &b, &e, &d); SKf (k,  4, &a, &c, &d, &b);
-		LKf (k, 12, &b, &a, &e, &c); S5f (&b, &a, &e, &c, &d); SKf (k,  8, &a, &c, &b, &e);
-		LKf (k, 16, &e, &b, &d, &c); S4f (&e, &b, &d, &c, &a); SKf (k, 12, &b, &a, &e, &c);
-	}
-	LKf (k, 20, &a, &e, &b, &d); S3f (&a, &e, &b, &d, &c); SKf (k, 16, &e, &b, &d, &c);
-}
-
-#endif // TC_MINIMIZE_CODE_SIZE
-
-
-#ifndef TC_MINIMIZE_CODE_SIZE
-
-void serpent_encrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks)
-{
-	unsigned __int32 a, b, c, d, e;
-	unsigned int i=1;
-	const unsigned __int32 *k = (unsigned __int32 *)ks + 8;
-	unsigned __int32 *in = (unsigned __int32 *) inBlock;
-	unsigned __int32 *out = (unsigned __int32 *) outBlock;
-
-    a = LE32(in[0]);
-	b = LE32(in[1]);
-	c = LE32(in[2]);
-	d = LE32(in[3]);
-
-	do
-	{
-		beforeS0(KX); beforeS0(S0); afterS0(LT);
-		afterS0(KX); afterS0(S1); afterS1(LT);
-		afterS1(KX); afterS1(S2); afterS2(LT);
-		afterS2(KX); afterS2(S3); afterS3(LT);
-		afterS3(KX); afterS3(S4); afterS4(LT);
-		afterS4(KX); afterS4(S5); afterS5(LT);
-		afterS5(KX); afterS5(S6); afterS6(LT);
-		afterS6(KX); afterS6(S7);
-
-		if (i == 4)
-			break;
-
-		++i;
-		c = b;
-		b = e;
-		e = d;
-		d = a;
-		a = e;
-		k += 32;
-		beforeS0(LT);
-	}
-	while (1);
-
-	afterS7(KX);
-	
-    out[0] = LE32(d);
-	out[1] = LE32(e);
-	out[2] = LE32(b);
-	out[3] = LE32(a);
-}
-
-#else // TC_MINIMIZE_CODE_SIZE
-
-typedef unsigned __int32 uint32;
-
-static void LTf (uint32 *a, uint32 *b, uint32 *c, uint32 *d)
-{
-	*a = rotlFixed(*a, 13);
-	*c = rotlFixed(*c, 3);
-	*d = rotlFixed(*d ^ *c ^ (*a << 3), 7);
-	*b = rotlFixed(*b ^ *a ^ *c, 1);
-	*a = rotlFixed(*a ^ *b ^ *d, 5);
-	*c = rotlFixed(*c ^ *d ^ (*b << 7), 22);
-}
-
-void serpent_encrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks)
-{
-	unsigned __int32 a, b, c, d, e;
-	unsigned int i=1;
-	const unsigned __int32 *k = (unsigned __int32 *)ks + 8;
-	unsigned __int32 *in = (unsigned __int32 *) inBlock;
-	unsigned __int32 *out = (unsigned __int32 *) outBlock;
-
-    a = LE32(in[0]);
-	b = LE32(in[1]);
-	c = LE32(in[2]);
-	d = LE32(in[3]);
-
-	do
-	{
-		KXf (k,  0, &a, &b, &c, &d); S0f (&a, &b, &c, &d, &e); LTf (&b, &e, &c, &a);
-		KXf (k,  4, &b, &e, &c, &a); S1f (&b, &e, &c, &a, &d); LTf (&c, &b, &a, &e);
-		KXf (k,  8, &c, &b, &a, &e); S2f (&c, &b, &a, &e, &d); LTf (&a, &e, &b, &d);
-		KXf (k, 12, &a, &e, &b, &d); S3f (&a, &e, &b, &d, &c); LTf (&e, &b, &d, &c);
-		KXf (k, 16, &e, &b, &d, &c); S4f (&e, &b, &d, &c, &a); LTf (&b, &a, &e, &c);
-		KXf (k, 20, &b, &a, &e, &c); S5f (&b, &a, &e, &c, &d); LTf (&a, &c, &b, &e);
-		KXf (k, 24, &a, &c, &b, &e); S6f (&a, &c, &b, &e, &d); LTf (&a, &c, &d, &b);
-		KXf (k, 28, &a, &c, &d, &b); S7f (&a, &c, &d, &b, &e);
-
-		if (i == 4)
-			break;
-
-		++i;
-		c = b;
-		b = e;
-		e = d;
-		d = a;
-		a = e;
-		k += 32;
-		LTf (&a,&b,&c,&d);
-	}
-	while (1);
-
-	KXf (k, 32, &d, &e, &b, &a);
-	
-    out[0] = LE32(d);
-	out[1] = LE32(e);
-	out[2] = LE32(b);
-	out[3] = LE32(a);
-}
-
-#endif // TC_MINIMIZE_CODE_SIZE
-
-#if !defined (TC_MINIMIZE_CODE_SIZE)
-
-void serpent_decrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks)
-{
-	unsigned __int32 a, b, c, d, e;
-	const unsigned __int32 *k = (unsigned __int32 *)ks + 104;
-	unsigned int i=4;
-	unsigned __int32 *in = (unsigned __int32 *) inBlock;
-	unsigned __int32 *out = (unsigned __int32 *) outBlock;
-
-    a = LE32(in[0]);
-	b = LE32(in[1]);
-	c = LE32(in[2]);
-	d = LE32(in[3]);
-
-	beforeI7(KX);
-	goto start;
-
-	do
-	{
-		c = b;
-		b = d;
-		d = e;
-		k -= 32;
-		beforeI7(ILT);
-start:
-		beforeI7(I7); afterI7(KX); 
-		afterI7(ILT); afterI7(I6); afterI6(KX); 
-		afterI6(ILT); afterI6(I5); afterI5(KX); 
-		afterI5(ILT); afterI5(I4); afterI4(KX); 
-		afterI4(ILT); afterI4(I3); afterI3(KX); 
-		afterI3(ILT); afterI3(I2); afterI2(KX); 
-		afterI2(ILT); afterI2(I1); afterI1(KX); 
-		afterI1(ILT); afterI1(I0); afterI0(KX);
-	}
-	while (--i != 0);
-	
-    out[0] = LE32(a);
-	out[1] = LE32(d);
-	out[2] = LE32(b);
-	out[3] = LE32(e);
-}
-
-#else // TC_MINIMIZE_CODE_SIZE
-
-static void ILTf (uint32 *a, uint32 *b, uint32 *c, uint32 *d)
-{ 
-	*c = rotrFixed(*c, 22);
-	*a = rotrFixed(*a, 5);
-	*c ^= *d ^ (*b << 7);
-	*a ^= *b ^ *d;
-	*b = rotrFixed(*b, 1);
-	*d = rotrFixed(*d, 7) ^ *c ^ (*a << 3);
-	*b ^= *a ^ *c;
-	*c = rotrFixed(*c, 3);
-	*a = rotrFixed(*a, 13);
-}
-
-void serpent_decrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks)
-{
-	unsigned __int32 a, b, c, d, e;
-	const unsigned __int32 *k = (unsigned __int32 *)ks + 104;
-	unsigned int i=4;
-	unsigned __int32 *in = (unsigned __int32 *) inBlock;
-	unsigned __int32 *out = (unsigned __int32 *) outBlock;
-
-    a = LE32(in[0]);
-	b = LE32(in[1]);
-	c = LE32(in[2]);
-	d = LE32(in[3]);
-
-	KXf (k, 32, &a, &b, &c, &d);
-	goto start;
-
-	do
-	{
-		c = b;
-		b = d;
-		d = e;
-		k -= 32;
-		beforeI7(ILT);
-start:
-		beforeI7(I7); KXf (k, 28, &d, &a, &b, &e);
-		ILTf (&d, &a, &b, &e); afterI7(I6); KXf (k, 24, &a, &b, &c, &e); 
-		ILTf (&a, &b, &c, &e); afterI6(I5); KXf (k, 20, &b, &d, &e, &c); 
-		ILTf (&b, &d, &e, &c); afterI5(I4); KXf (k, 16, &b, &c, &e, &a); 
-		ILTf (&b, &c, &e, &a); afterI4(I3); KXf (k, 12, &a, &b, &e, &c);
-		ILTf (&a, &b, &e, &c); afterI3(I2); KXf (k, 8,  &b, &d, &e, &c);
-		ILTf (&b, &d, &e, &c); afterI2(I1); KXf (k, 4,  &a, &b, &c, &e);
-		ILTf (&a, &b, &c, &e); afterI1(I0); KXf (k, 0,  &a, &d, &b, &e);
-	}
-	while (--i != 0);
-	
-    out[0] = LE32(a);
-	out[1] = LE32(d);
-	out[2] = LE32(b);
-	out[3] = LE32(e);
-}
-
-#endif // TC_MINIMIZE_CODE_SIZE
+// serpent.cpp - written and placed in the public domain by Wei Dai
+
+/* Adapted for TrueCrypt */
+/* Adapted for VeraCrypt */
+
+#ifdef TC_WINDOWS_BOOT
+#pragma optimize ("t", on)
+#endif
+
+#include "Serpent.h"
+#include "Common/Endian.h"
+
+#include <memory.h>
+
+#if defined(_WIN32) && !defined(_DEBUG)
+#include <stdlib.h>
+#define rotlFixed _rotl // non-debug Windows builds: map the rotate helpers to _rotl/_rotr, for which <stdlib.h> is included above
+#define rotrFixed _rotr
+#else
+#define rotlFixed(x,n)   (((x) << (n)) | ((x) >> (32 - (n)))) // shift-based left rotate; assumes x is a 32-bit unsigned value and n is 1..31 (n = 0 would shift by 32); x and n are each evaluated twice
+#define rotrFixed(x,n)   (((x) >> (n)) | ((x) << (32 - (n)))) // right-rotate counterpart with the same assumptions
+#endif
+
+// linear transformation: mixes the words a, b, c, d in place with fixed left-rotates, left-shifts and XORs; parameters i and e are accepted but not used
+#define LT(i,a,b,c,d,e)	{\
+	a = rotlFixed(a, 13);	\
+	c = rotlFixed(c, 3); 	\
+	d = rotlFixed(d ^ c ^ (a << 3), 7); 	\
+	b = rotlFixed(b ^ a ^ c, 1); 	\
+	a = rotlFixed(a ^ b ^ d, 5); 		\
+	c = rotlFixed(c ^ d ^ (b << 7), 22);}
+
+// inverse linear transformation: undoes LT by applying the inverse of each LT step in reverse order (right-rotates instead of left); parameters i and e are not used
+#define ILT(i,a,b,c,d,e)	{\
+	c = rotrFixed(c, 22);	\
+	a = rotrFixed(a, 5); 	\
+	c ^= d ^ (b << 7);	\
+	a ^= b ^ d; 		\
+	b = rotrFixed(b, 1); 	\
+	d = rotrFixed(d, 7) ^ c ^ (a << 3);	\
+	b ^= a ^ c; 		\
+	c = rotrFixed(c, 3); 	\
+	a = rotrFixed(a, 13);}
+
+// order of output from S-box functions: each helper calls f(step, r0..r4) with the variables a..e permuted to where the words sit at that point; step runs 0..8
+#define beforeS0(f) f(0,a,b,c,d,e)
+#define afterS0(f) f(1,b,e,c,a,d)
+#define afterS1(f) f(2,c,b,a,e,d)
+#define afterS2(f) f(3,a,e,b,d,c)
+#define afterS3(f) f(4,e,b,d,c,a)
+#define afterS4(f) f(5,b,a,e,c,d)
+#define afterS5(f) f(6,a,c,b,e,d)
+#define afterS6(f) f(7,a,c,d,b,e)
+#define afterS7(f) f(8,d,e,b,a,c)
+
+// order of output from inverse S-box functions
+#define beforeI7(f) f(8,a,b,c,d,e)
+#define afterI7(f) f(7,d,a,b,e,c)
+#define afterI6(f) f(6,a,b,c,e,d)
+#define afterI5(f) f(5,b,d,e,c,a)
+#define afterI4(f) f(4,b,c,e,a,d)
+#define afterI3(f) f(3,a,b,e,c,d)
+#define afterI2(f) f(2,b,d,e,c,a)
+#define afterI1(f) f(1,a,b,c,e,d)
+#define afterI0(f) f(0,a,d,b,e,c)
+
+// The instruction sequences for the S-box functions 
+// come from Dag Arne Osvik's paper "Speeding up Serpent".
+
+#define S0(i, r0, r1, r2, r3, r4) \
+       {           \
+    r3 ^= r0;   \
+    r4 = r1;   \
+    r1 &= r3;   \
+    r4 ^= r2;   \
+    r1 ^= r0;   \
+    r0 |= r3;   \
+    r0 ^= r4;   \
+    r4 ^= r3;   \
+    r3 ^= r2;   \
+    r2 |= r1;   \
+    r2 ^= r4;   \
+    r4 = ~r4;      \
+    r4 |= r1;   \
+    r1 ^= r3;   \
+    r1 ^= r4;   \
+    r3 |= r0;   \
+    r1 ^= r3;   \
+    r4 ^= r3;   \
+            }
+
+#define I0(i, r0, r1, r2, r3, r4) \
+       {           \
+    r2 = ~r2;      \
+    r4 = r1;   \
+    r1 |= r0;   \
+    r4 = ~r4;      \
+    r1 ^= r2;   \
+    r2 |= r4;   \
+    r1 ^= r3;   \
+    r0 ^= r4;   \
+    r2 ^= r0;   \
+    r0 &= r3;   \
+    r4 ^= r0;   \
+    r0 |= r1;   \
+    r0 ^= r2;   \
+    r3 ^= r4;   \
+    r2 ^= r1;   \
+    r3 ^= r0;   \
+    r3 ^= r1;   \
+    r2 &= r3;   \
+    r4 ^= r2;   \
+            }
+
+#define S1(i, r0, r1, r2, r3, r4) \
+       {           \
+    r0 = ~r0;      \
+    r2 = ~r2;      \
+    r4 = r0;   \
+    r0 &= r1;   \
+    r2 ^= r0;   \
+    r0 |= r3;   \
+    r3 ^= r2;   \
+    r1 ^= r0;   \
+    r0 ^= r4;   \
+    r4 |= r1;   \
+    r1 ^= r3;   \
+    r2 |= r0;   \
+    r2 &= r4;   \
+    r0 ^= r1;   \
+    r1 &= r2;   \
+    r1 ^= r0;   \
+    r0 &= r2;   \
+    r0 ^= r4;   \
+            }
+
+#define I1(i, r0, r1, r2, r3, r4) \
+       {           \
+    r4 = r1;   \
+    r1 ^= r3;   \
+    r3 &= r1;   \
+    r4 ^= r2;   \
+    r3 ^= r0;   \
+    r0 |= r1;   \
+    r2 ^= r3;   \
+    r0 ^= r4;   \
+    r0 |= r2;   \
+    r1 ^= r3;   \
+    r0 ^= r1;   \
+    r1 |= r3;   \
+    r1 ^= r0;   \
+    r4 = ~r4;      \
+    r4 ^= r1;   \
+    r1 |= r0;   \
+    r1 ^= r0;   \
+    r1 |= r4;   \
+    r3 ^= r1;   \
+            }
+
+#define S2(i, r0, r1, r2, r3, r4) \
+       {           \
+    r4 = r0;   \
+    r0 &= r2;   \
+    r0 ^= r3;   \
+    r2 ^= r1;   \
+    r2 ^= r0;   \
+    r3 |= r4;   \
+    r3 ^= r1;   \
+    r4 ^= r2;   \
+    r1 = r3;   \
+    r3 |= r4;   \
+    r3 ^= r0;   \
+    r0 &= r1;   \
+    r4 ^= r0;   \
+    r1 ^= r3;   \
+    r1 ^= r4;   \
+    r4 = ~r4;      \
+            }
+
+#define I2(i, r0, r1, r2, r3, r4) \
+       {           \
+    r2 ^= r3;   \
+    r3 ^= r0;   \
+    r4 = r3;   \
+    r3 &= r2;   \
+    r3 ^= r1;   \
+    r1 |= r2;   \
+    r1 ^= r4;   \
+    r4 &= r3;   \
+    r2 ^= r3;   \
+    r4 &= r0;   \
+    r4 ^= r2;   \
+    r2 &= r1;   \
+    r2 |= r0;   \
+    r3 = ~r3;      \
+    r2 ^= r3;   \
+    r0 ^= r3;   \
+    r0 &= r1;   \
+    r3 ^= r4;   \
+    r3 ^= r0;   \
+            }
+
+#define S3(i, r0, r1, r2, r3, r4) \
+       {           \
+    r4 = r0;   \
+    r0 |= r3;   \
+    r3 ^= r1;   \
+    r1 &= r4;   \
+    r4 ^= r2;   \
+    r2 ^= r3;   \
+    r3 &= r0;   \
+    r4 |= r1;   \
+    r3 ^= r4;   \
+    r0 ^= r1;   \
+    r4 &= r0;   \
+    r1 ^= r3;   \
+    r4 ^= r2;   \
+    r1 |= r0;   \
+    r1 ^= r2;   \
+    r0 ^= r3;   \
+    r2 = r1;   \
+    r1 |= r3;   \
+    r1 ^= r0;   \
+            }
+
+#define I3(i, r0, r1, r2, r3, r4) \
+       {           \
+    r4 = r2;   \
+    r2 ^= r1;   \
+    r1 &= r2;   \
+    r1 ^= r0;   \
+    r0 &= r4;   \
+    r4 ^= r3;   \
+    r3 |= r1;   \
+    r3 ^= r2;   \
+    r0 ^= r4;   \
+    r2 ^= r0;   \
+    r0 |= r3;   \
+    r0 ^= r1;   \
+    r4 ^= r2;   \
+    r2 &= r3;   \
+    r1 |= r3;   \
+    r1 ^= r2;   \
+    r4 ^= r0;   \
+    r2 ^= r4;   \
+            }
+
+#define S4(i, r0, r1, r2, r3, r4) \
+       {           \
+    r1 ^= r3;   \
+    r3 = ~r3;      \
+    r2 ^= r3;   \
+    r3 ^= r0;   \
+    r4 = r1;   \
+    r1 &= r3;   \
+    r1 ^= r2;   \
+    r4 ^= r3;   \
+    r0 ^= r4;   \
+    r2 &= r4;   \
+    r2 ^= r0;   \
+    r0 &= r1;   \
+    r3 ^= r0;   \
+    r4 |= r1;   \
+    r4 ^= r0;   \
+    r0 |= r3;   \
+    r0 ^= r2;   \
+    r2 &= r3;   \
+    r0 = ~r0;      \
+    r4 ^= r2;   \
+            }
+
+#define I4(i, r0, r1, r2, r3, r4) \
+       {           \
+    r4 = r2;   \
+    r2 &= r3;   \
+    r2 ^= r1;   \
+    r1 |= r3;   \
+    r1 &= r0;   \
+    r4 ^= r2;   \
+    r4 ^= r1;   \
+    r1 &= r2;   \
+    r0 = ~r0;      \
+    r3 ^= r4;   \
+    r1 ^= r3;   \
+    r3 &= r0;   \
+    r3 ^= r2;   \
+    r0 ^= r1;   \
+    r2 &= r0;   \
+    r3 ^= r0;   \
+    r2 ^= r4;   \
+    r2 |= r3;   \
+    r3 ^= r0;   \
+    r2 ^= r1;   \
+            }
+
+#define S5(i, r0, r1, r2, r3, r4) \
+       {           \
+    r0 ^= r1;   \
+    r1 ^= r3;   \
+    r3 = ~r3;      \
+    r4 = r1;   \
+    r1 &= r0;   \
+    r2 ^= r3;   \
+    r1 ^= r2;   \
+    r2 |= r4;   \
+    r4 ^= r3;   \
+    r3 &= r1;   \
+    r3 ^= r0;   \
+    r4 ^= r1;   \
+    r4 ^= r2;   \
+    r2 ^= r0;   \
+    r0 &= r3;   \
+    r2 = ~r2;      \
+    r0 ^= r4;   \
+    r4 |= r3;   \
+    r2 ^= r4;   \
+            }
+
+#define I5(i, r0, r1, r2, r3, r4) \
+       {           \
+    r1 = ~r1;      \
+    r4 = r3;   \
+    r2 ^= r1;   \
+    r3 |= r0;   \
+    r3 ^= r2;   \
+    r2 |= r1;   \
+    r2 &= r0;   \
+    r4 ^= r3;   \
+    r2 ^= r4;   \
+    r4 |= r0;   \
+    r4 ^= r1;   \
+    r1 &= r2;   \
+    r1 ^= r3;   \
+    r4 ^= r2;   \
+    r3 &= r4;   \
+    r4 ^= r1;   \
+    r3 ^= r0;   \
+    r3 ^= r4;   \
+    r4 = ~r4;      \
+            }
+
+#define S6(i, r0, r1, r2, r3, r4) \
+       {           \
+    r2 = ~r2;      \
+    r4 = r3;   \
+    r3 &= r0;   \
+    r0 ^= r4;   \
+    r3 ^= r2;   \
+    r2 |= r4;   \
+    r1 ^= r3;   \
+    r2 ^= r0;   \
+    r0 |= r1;   \
+    r2 ^= r1;   \
+    r4 ^= r0;   \
+    r0 |= r3;   \
+    r0 ^= r2;   \
+    r4 ^= r3;   \
+    r4 ^= r0;   \
+    r3 = ~r3;      \
+    r2 &= r4;   \
+    r2 ^= r3;   \
+            }
+
+#define I6(i, r0, r1, r2, r3, r4) \
+       {           \
+    r0 ^= r2;   \
+    r4 = r2;   \
+    r2 &= r0;   \
+    r4 ^= r3;   \
+    r2 = ~r2;      \
+    r3 ^= r1;   \
+    r2 ^= r3;   \
+    r4 |= r0;   \
+    r0 ^= r2;   \
+    r3 ^= r4;   \
+    r4 ^= r1;   \
+    r1 &= r3;   \
+    r1 ^= r0;   \
+    r0 ^= r3;   \
+    r0 |= r2;   \
+    r3 ^= r1;   \
+    r4 ^= r0;   \
+            }
+
+#define S7(i, r0, r1, r2, r3, r4) \
+       {           \
+    r4 = r2;   \
+    r2 &= r1;   \
+    r2 ^= r3;   \
+    r3 &= r1;   \
+    r4 ^= r2;   \
+    r2 ^= r1;   \
+    r1 ^= r0;   \
+    r0 |= r4;   \
+    r0 ^= r2;   \
+    r3 ^= r1;   \
+    r2 ^= r3;   \
+    r3 &= r0;   \
+    r3 ^= r4;   \
+    r4 ^= r2;   \
+    r2 &= r0;   \
+    r4 = ~r4;      \
+    r2 ^= r4;   \
+    r4 &= r0;   \
+    r1 ^= r3;   \
+    r4 ^= r1;   \
+            }
+
+#define I7(i, r0, r1, r2, r3, r4) \
+       {           \
+    r4 = r2;   \
+    r2 ^= r0;   \
+    r0 &= r3;   \
+    r2 = ~r2;      \
+    r4 |= r3;   \
+    r3 ^= r1;   \
+    r1 |= r0;   \
+    r0 ^= r2;   \
+    r2 &= r4;   \
+    r1 ^= r2;   \
+    r2 ^= r0;   \
+    r0 |= r2;   \
+    r3 &= r4;   \
+    r0 ^= r3;   \
+    r4 ^= r1;   \
+    r3 ^= r4;   \
+    r4 |= r0;   \
+    r3 ^= r2;   \
+    r4 ^= r2;   \
+            }
+
+// key xor
+#define KX(r, a, b, c, d, e)	{\
+	a ^= k[4 * r + 0]; \
+	b ^= k[4 * r + 1]; \
+	c ^= k[4 * r + 2]; \
+	d ^= k[4 * r + 3];}
+
+
+#ifdef TC_MINIMIZE_CODE_SIZE
+
+static void S0f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
+{
+	*r3 ^= *r0;
+	*r4 = *r1;
+	*r1 &= *r3;
+	*r4 ^= *r2;
+	*r1 ^= *r0;
+	*r0 |= *r3;
+	*r0 ^= *r4;
+	*r4 ^= *r3;
+	*r3 ^= *r2;
+	*r2 |= *r1;
+	*r2 ^= *r4;
+	*r4 = ~*r4;
+	*r4 |= *r1;
+	*r1 ^= *r3;
+	*r1 ^= *r4;
+	*r3 |= *r0;
+	*r1 ^= *r3;
+	*r4 ^= *r3;
+}
+
+static void S1f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
+{        
+    *r0 = ~*r0;   
+    *r2 = ~*r2;   
+    *r4 = *r0;
+    *r0 &= *r1;
+    *r2 ^= *r0;
+    *r0 |= *r3;
+    *r3 ^= *r2;
+    *r1 ^= *r0;
+    *r0 ^= *r4;
+    *r4 |= *r1;
+    *r1 ^= *r3;
+    *r2 |= *r0;
+    *r2 &= *r4;
+    *r0 ^= *r1;
+    *r1 &= *r2;
+    *r1 ^= *r0;
+    *r0 &= *r2;
+    *r0 ^= *r4;
+}
+
+static void S2f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
+{        
+	*r4 = *r0;
+	*r0 &= *r2;
+	*r0 ^= *r3;
+	*r2 ^= *r1;
+	*r2 ^= *r0;
+	*r3 |= *r4;
+	*r3 ^= *r1;
+	*r4 ^= *r2;
+	*r1 = *r3;
+	*r3 |= *r4;
+	*r3 ^= *r0;
+	*r0 &= *r1;
+	*r4 ^= *r0;
+	*r1 ^= *r3;
+	*r1 ^= *r4;
+	*r4 = ~*r4;   
+}
+
+static void S3f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
+{        
+	*r4 = *r0;
+	*r0 |= *r3;
+	*r3 ^= *r1;
+	*r1 &= *r4;
+	*r4 ^= *r2;
+	*r2 ^= *r3;
+	*r3 &= *r0;
+	*r4 |= *r1;
+	*r3 ^= *r4;
+	*r0 ^= *r1;
+	*r4 &= *r0;
+	*r1 ^= *r3;
+	*r4 ^= *r2;
+	*r1 |= *r0;
+	*r1 ^= *r2;
+	*r0 ^= *r3;
+	*r2 = *r1;
+	*r1 |= *r3;
+	*r1 ^= *r0;
+}
+
+static void S4f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
+{        
+	*r1 ^= *r3;
+	*r3 = ~*r3;   
+	*r2 ^= *r3;
+	*r3 ^= *r0;
+	*r4 = *r1;
+	*r1 &= *r3;
+	*r1 ^= *r2;
+	*r4 ^= *r3;
+	*r0 ^= *r4;
+	*r2 &= *r4;
+	*r2 ^= *r0;
+	*r0 &= *r1;
+	*r3 ^= *r0;
+	*r4 |= *r1;
+	*r4 ^= *r0;
+	*r0 |= *r3;
+	*r0 ^= *r2;
+	*r2 &= *r3;
+	*r0 = ~*r0;   
+	*r4 ^= *r2;
+}
+
+static void S5f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
+{        
+	*r0 ^= *r1;
+	*r1 ^= *r3;
+	*r3 = ~*r3;   
+	*r4 = *r1;
+	*r1 &= *r0;
+	*r2 ^= *r3;
+	*r1 ^= *r2;
+	*r2 |= *r4;
+	*r4 ^= *r3;
+	*r3 &= *r1;
+	*r3 ^= *r0;
+	*r4 ^= *r1;
+	*r4 ^= *r2;
+	*r2 ^= *r0;
+	*r0 &= *r3;
+	*r2 = ~*r2;   
+	*r0 ^= *r4;
+	*r4 |= *r3;
+	*r2 ^= *r4;
+}
+
+static void S6f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
+{        
+	*r2 = ~*r2;   
+	*r4 = *r3;
+	*r3 &= *r0;
+	*r0 ^= *r4;
+	*r3 ^= *r2;
+	*r2 |= *r4;
+	*r1 ^= *r3;
+	*r2 ^= *r0;
+	*r0 |= *r1;
+	*r2 ^= *r1;
+	*r4 ^= *r0;
+	*r0 |= *r3;
+	*r0 ^= *r2;
+	*r4 ^= *r3;
+	*r4 ^= *r0;
+	*r3 = ~*r3;   
+	*r2 &= *r4;
+	*r2 ^= *r3;
+}
+
+static void S7f (unsigned __int32 *r0, unsigned __int32 *r1, unsigned __int32 *r2, unsigned __int32 *r3, unsigned __int32 *r4)
+{        
+	*r4 = *r2;
+	*r2 &= *r1;
+	*r2 ^= *r3;
+	*r3 &= *r1;
+	*r4 ^= *r2;
+	*r2 ^= *r1;
+	*r1 ^= *r0;
+	*r0 |= *r4;
+	*r0 ^= *r2;
+	*r3 ^= *r1;
+	*r2 ^= *r3;
+	*r3 &= *r0;
+	*r3 ^= *r4;
+	*r4 ^= *r2;
+	*r2 &= *r0;
+	*r4 = ~*r4;   
+	*r2 ^= *r4;
+	*r4 &= *r0;
+	*r1 ^= *r3;
+	*r4 ^= *r1;
+}
+
+static void KXf (const unsigned __int32 *k, unsigned int r, unsigned __int32 *a, unsigned __int32 *b, unsigned __int32 *c, unsigned __int32 *d)
+{
+	*a ^= k[r];
+	*b ^= k[r + 1];
+	*c ^= k[r + 2];
+	*d ^= k[r + 3];
+}
+
+#endif // TC_MINIMIZE_CODE_SIZE
+
+#ifndef TC_MINIMIZE_CODE_SIZE
+
+void serpent_set_key(const unsigned __int8 userKey[],unsigned __int8 *ks)
+{
+	unsigned __int32 a,b,c,d,e;
+	unsigned __int32 *k = (unsigned __int32 *)ks;
+	unsigned __int32 t;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		k[i] = LE32(((unsigned __int32*)userKey)[i]);
+
+	k += 8;
+	t = k[-1];
+	for (i = 0; i < 132; ++i)
+		k[i] = t = rotlFixed(k[i-8] ^ k[i-5] ^ k[i-3] ^ t ^ 0x9e3779b9 ^ i, 11);
+	k -= 20;
+
+#define LK(r, a, b, c, d, e)	{\
+	a = k[(8-r)*4 + 0];		\
+	b = k[(8-r)*4 + 1];		\
+	c = k[(8-r)*4 + 2];		\
+	d = k[(8-r)*4 + 3];}
+
+#define SK(r, a, b, c, d, e)	{\
+	k[(8-r)*4 + 4] = a;		\
+	k[(8-r)*4 + 5] = b;		\
+	k[(8-r)*4 + 6] = c;		\
+	k[(8-r)*4 + 7] = d;}	\
+
+	for (i=0; i<4; i++)
+	{
+		afterS2(LK); afterS2(S3); afterS3(SK);
+		afterS1(LK); afterS1(S2); afterS2(SK);
+		afterS0(LK); afterS0(S1); afterS1(SK);
+		beforeS0(LK); beforeS0(S0); afterS0(SK);
+		k += 8*4;
+		afterS6(LK); afterS6(S7); afterS7(SK);
+		afterS5(LK); afterS5(S6); afterS6(SK);
+		afterS4(LK); afterS4(S5); afterS5(SK);
+		afterS3(LK); afterS3(S4); afterS4(SK);
+	}
+	afterS2(LK); afterS2(S3); afterS3(SK);
+}
+
+#else // TC_MINIMIZE_CODE_SIZE
+
+static void LKf (unsigned __int32 *k, unsigned int r, unsigned __int32 *a, unsigned __int32 *b, unsigned __int32 *c, unsigned __int32 *d)
+{
+	*a = k[r];
+	*b = k[r + 1];
+	*c = k[r + 2];
+	*d = k[r + 3];
+}
+
+static void SKf (unsigned __int32 *k, unsigned int r, unsigned __int32 *a, unsigned __int32 *b, unsigned __int32 *c, unsigned __int32 *d)
+{
+	k[r + 4] = *a;
+	k[r + 5] = *b;
+	k[r + 6] = *c;
+	k[r + 7] = *d;
+}
+
+void serpent_set_key(const unsigned __int8 userKey[], unsigned __int8 *ks)
+{
+	unsigned __int32 a,b,c,d,e;
+	unsigned __int32 *k = (unsigned __int32 *)ks;
+	unsigned __int32 t;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		k[i] = LE32(((unsigned __int32*)userKey)[i]);
+
+	k += 8;
+	t = k[-1];
+	for (i = 0; i < 132; ++i)
+		k[i] = t = rotlFixed(k[i-8] ^ k[i-5] ^ k[i-3] ^ t ^ 0x9e3779b9 ^ i, 11);
+	k -= 20;
+
+	for (i=0; i<4; i++)
+	{
+		LKf (k, 20, &a, &e, &b, &d); S3f (&a, &e, &b, &d, &c); SKf (k, 16, &e, &b, &d, &c);
+		LKf (k, 24, &c, &b, &a, &e); S2f (&c, &b, &a, &e, &d); SKf (k, 20, &a, &e, &b, &d);
+		LKf (k, 28, &b, &e, &c, &a); S1f (&b, &e, &c, &a, &d); SKf (k, 24, &c, &b, &a, &e);
+		LKf (k, 32, &a, &b, &c, &d); S0f (&a, &b, &c, &d, &e); SKf (k, 28, &b, &e, &c, &a);
+		k += 8*4;
+		LKf (k,  4, &a, &c, &d, &b); S7f (&a, &c, &d, &b, &e); SKf (k,  0, &d, &e, &b, &a);
+		LKf (k,  8, &a, &c, &b, &e); S6f (&a, &c, &b, &e, &d); SKf (k,  4, &a, &c, &d, &b);
+		LKf (k, 12, &b, &a, &e, &c); S5f (&b, &a, &e, &c, &d); SKf (k,  8, &a, &c, &b, &e);
+		LKf (k, 16, &e, &b, &d, &c); S4f (&e, &b, &d, &c, &a); SKf (k, 12, &b, &a, &e, &c);
+	}
+	LKf (k, 20, &a, &e, &b, &d); S3f (&a, &e, &b, &d, &c); SKf (k, 16, &e, &b, &d, &c);
+}
+
+#endif // TC_MINIMIZE_CODE_SIZE
+
+
+#ifndef TC_MINIMIZE_CODE_SIZE
+
+void serpent_encrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks)
+{
+	unsigned __int32 a, b, c, d, e;
+	unsigned int i=1;
+	const unsigned __int32 *k = (unsigned __int32 *)ks + 8;
+	unsigned __int32 *in = (unsigned __int32 *) inBlock;
+	unsigned __int32 *out = (unsigned __int32 *) outBlock;
+
+    a = LE32(in[0]);
+	b = LE32(in[1]);
+	c = LE32(in[2]);
+	d = LE32(in[3]);
+
+	do
+	{
+		beforeS0(KX); beforeS0(S0); afterS0(LT);
+		afterS0(KX); afterS0(S1); afterS1(LT);
+		afterS1(KX); afterS1(S2); afterS2(LT);
+		afterS2(KX); afterS2(S3); afterS3(LT);
+		afterS3(KX); afterS3(S4); afterS4(LT);
+		afterS4(KX); afterS4(S5); afterS5(LT);
+		afterS5(KX); afterS5(S6); afterS6(LT);
+		afterS6(KX); afterS6(S7);
+
+		if (i == 4)
+			break;
+
+		++i;
+		c = b;
+		b = e;
+		e = d;
+		d = a;
+		a = e;
+		k += 32;
+		beforeS0(LT);
+	}
+	while (1);
+
+	afterS7(KX);
+	
+    out[0] = LE32(d);
+	out[1] = LE32(e);
+	out[2] = LE32(b);
+	out[3] = LE32(a);
+}
+
+#else // TC_MINIMIZE_CODE_SIZE
+
+typedef unsigned __int32 uint32;
+
+static void LTf (uint32 *a, uint32 *b, uint32 *c, uint32 *d)
+{
+	*a = rotlFixed(*a, 13);
+	*c = rotlFixed(*c, 3);
+	*d = rotlFixed(*d ^ *c ^ (*a << 3), 7);
+	*b = rotlFixed(*b ^ *a ^ *c, 1);
+	*a = rotlFixed(*a ^ *b ^ *d, 5);
+	*c = rotlFixed(*c ^ *d ^ (*b << 7), 22);
+}
+
+void serpent_encrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks)
+{
+	unsigned __int32 a, b, c, d, e;
+	unsigned int i=1;
+	const unsigned __int32 *k = (unsigned __int32 *)ks + 8;
+	unsigned __int32 *in = (unsigned __int32 *) inBlock;
+	unsigned __int32 *out = (unsigned __int32 *) outBlock;
+
+    a = LE32(in[0]);
+	b = LE32(in[1]);
+	c = LE32(in[2]);
+	d = LE32(in[3]);
+
+	do
+	{
+		KXf (k,  0, &a, &b, &c, &d); S0f (&a, &b, &c, &d, &e); LTf (&b, &e, &c, &a);
+		KXf (k,  4, &b, &e, &c, &a); S1f (&b, &e, &c, &a, &d); LTf (&c, &b, &a, &e);
+		KXf (k,  8, &c, &b, &a, &e); S2f (&c, &b, &a, &e, &d); LTf (&a, &e, &b, &d);
+		KXf (k, 12, &a, &e, &b, &d); S3f (&a, &e, &b, &d, &c); LTf (&e, &b, &d, &c);
+		KXf (k, 16, &e, &b, &d, &c); S4f (&e, &b, &d, &c, &a); LTf (&b, &a, &e, &c);
+		KXf (k, 20, &b, &a, &e, &c); S5f (&b, &a, &e, &c, &d); LTf (&a, &c, &b, &e);
+		KXf (k, 24, &a, &c, &b, &e); S6f (&a, &c, &b, &e, &d); LTf (&a, &c, &d, &b);
+		KXf (k, 28, &a, &c, &d, &b); S7f (&a, &c, &d, &b, &e);
+
+		if (i == 4)
+			break;
+
+		++i;
+		c = b;
+		b = e;
+		e = d;
+		d = a;
+		a = e;
+		k += 32;
+		LTf (&a,&b,&c,&d);
+	}
+	while (1);
+
+	KXf (k, 32, &d, &e, &b, &a);
+	
+    out[0] = LE32(d);
+	out[1] = LE32(e);
+	out[2] = LE32(b);
+	out[3] = LE32(a);
+}
+
+#endif // TC_MINIMIZE_CODE_SIZE
+
+#if !defined (TC_MINIMIZE_CODE_SIZE)
+
+void serpent_decrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks)
+{
+	unsigned __int32 a, b, c, d, e;
+	const unsigned __int32 *k = (unsigned __int32 *)ks + 104;
+	unsigned int i=4;
+	unsigned __int32 *in = (unsigned __int32 *) inBlock;
+	unsigned __int32 *out = (unsigned __int32 *) outBlock;
+
+    a = LE32(in[0]);
+	b = LE32(in[1]);
+	c = LE32(in[2]);
+	d = LE32(in[3]);
+
+	beforeI7(KX);
+	goto start;
+
+	do
+	{
+		c = b;
+		b = d;
+		d = e;
+		k -= 32;
+		beforeI7(ILT);
+start:
+		beforeI7(I7); afterI7(KX); 
+		afterI7(ILT); afterI7(I6); afterI6(KX); 
+		afterI6(ILT); afterI6(I5); afterI5(KX); 
+		afterI5(ILT); afterI5(I4); afterI4(KX); 
+		afterI4(ILT); afterI4(I3); afterI3(KX); 
+		afterI3(ILT); afterI3(I2); afterI2(KX); 
+		afterI2(ILT); afterI2(I1); afterI1(KX); 
+		afterI1(ILT); afterI1(I0); afterI0(KX);
+	}
+	while (--i != 0);
+	
+    out[0] = LE32(a);
+	out[1] = LE32(d);
+	out[2] = LE32(b);
+	out[3] = LE32(e);
+}
+
+#else // TC_MINIMIZE_CODE_SIZE
+
+static void ILTf (uint32 *a, uint32 *b, uint32 *c, uint32 *d)
+{ 
+	*c = rotrFixed(*c, 22);
+	*a = rotrFixed(*a, 5);
+	*c ^= *d ^ (*b << 7);
+	*a ^= *b ^ *d;
+	*b = rotrFixed(*b, 1);
+	*d = rotrFixed(*d, 7) ^ *c ^ (*a << 3);
+	*b ^= *a ^ *c;
+	*c = rotrFixed(*c, 3);
+	*a = rotrFixed(*a, 13);
+}
+
+void serpent_decrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks)
+{
+	unsigned __int32 a, b, c, d, e;
+	const unsigned __int32 *k = (unsigned __int32 *)ks + 104;
+	unsigned int i=4;
+	unsigned __int32 *in = (unsigned __int32 *) inBlock;
+	unsigned __int32 *out = (unsigned __int32 *) outBlock;
+
+    a = LE32(in[0]);
+	b = LE32(in[1]);
+	c = LE32(in[2]);
+	d = LE32(in[3]);
+
+	KXf (k, 32, &a, &b, &c, &d);
+	goto start;
+
+	do
+	{
+		c = b;
+		b = d;
+		d = e;
+		k -= 32;
+		beforeI7(ILT);
+start:
+		beforeI7(I7); KXf (k, 28, &d, &a, &b, &e);
+		ILTf (&d, &a, &b, &e); afterI7(I6); KXf (k, 24, &a, &b, &c, &e); 
+		ILTf (&a, &b, &c, &e); afterI6(I5); KXf (k, 20, &b, &d, &e, &c); 
+		ILTf (&b, &d, &e, &c); afterI5(I4); KXf (k, 16, &b, &c, &e, &a); 
+		ILTf (&b, &c, &e, &a); afterI4(I3); KXf (k, 12, &a, &b, &e, &c);
+		ILTf (&a, &b, &e, &c); afterI3(I2); KXf (k, 8,  &b, &d, &e, &c);
+		ILTf (&b, &d, &e, &c); afterI2(I1); KXf (k, 4,  &a, &b, &c, &e);
+		ILTf (&a, &b, &c, &e); afterI1(I0); KXf (k, 0,  &a, &d, &b, &e);
+	}
+	while (--i != 0);
+	
+    out[0] = LE32(a);
+	out[1] = LE32(d);
+	out[2] = LE32(b);
+	out[3] = LE32(e);
+}
+
+#endif // TC_MINIMIZE_CODE_SIZE
diff --git a/src/Crypto/Serpent.h b/src/Crypto/Serpent.h
index b88ddc4d..0f4ab787 100644
--- a/src/Crypto/Serpent.h
+++ b/src/Crypto/Serpent.h
@@ -1,20 +1,20 @@
-#ifndef HEADER_Crypto_Serpent
-#define HEADER_Crypto_Serpent
-
-#include "Common/Tcdefs.h"
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-/* userKey is always 32-bytes long */
-void serpent_set_key(const unsigned __int8 userKey[], unsigned __int8 *ks);
-void serpent_encrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks);
-void serpent_decrypt(const unsigned __int8 *inBlock,  unsigned __int8 *outBlock, unsigned __int8 *ks);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // HEADER_Crypto_Serpent
+#ifndef HEADER_Crypto_Serpent
+#define HEADER_Crypto_Serpent
+
+#include "Common/Tcdefs.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* userKey is always 32-bytes long */
+void serpent_set_key(const unsigned __int8 userKey[], unsigned __int8 *ks);
+void serpent_encrypt(const unsigned __int8 *inBlock, unsigned __int8 *outBlock, unsigned __int8 *ks);
+void serpent_decrypt(const unsigned __int8 *inBlock,  unsigned __int8 *outBlock, unsigned __int8 *ks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HEADER_Crypto_Serpent
diff --git a/src/Crypto/Sha2.c b/src/Crypto/Sha2.c
index f1a9850a..02680eb5 100644
--- a/src/Crypto/Sha2.c
+++ b/src/Crypto/Sha2.c
@@ -1,753 +1,753 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 01/08/2005
-
- This is a byte oriented version of SHA2 that operates on arrays of bytes
- stored in memory. This code implements sha256, sha384 and sha512 but the
- latter two functions rely on efficient 64-bit integer operations that
- may not be very efficient on 32-bit machines
-
- The sha256 functions use a type 'sha256_ctx' to hold details of the
- current hash state and uses the following three calls:
-
-       void sha256_begin(sha256_ctx ctx[1])
-       void sha256_hash(const unsigned char data[],
-                            unsigned long len, sha256_ctx ctx[1])
-       void sha_end1(unsigned char hval[], sha256_ctx ctx[1])
-
- The first subroutine initialises a hash computation by setting up the
- context in the sha256_ctx context. The second subroutine hashes 8-bit
- bytes from array data[] into the hash state withinh sha256_ctx context,
- the number of bytes to be hashed being given by the the unsigned long
- integer len.  The third subroutine completes the hash calculation and
- places the resulting digest value in the array of 8-bit bytes hval[].
-
- The sha384 and sha512 functions are similar and use the interfaces:
-
-       void sha384_begin(sha384_ctx ctx[1]);
-       void sha384_hash(const unsigned char data[],
-                            unsigned long len, sha384_ctx ctx[1]);
-       void sha384_end(unsigned char hval[], sha384_ctx ctx[1]);
-
-       void sha512_begin(sha512_ctx ctx[1]);
-       void sha512_hash(const unsigned char data[],
-                            unsigned long len, sha512_ctx ctx[1]);
-       void sha512_end(unsigned char hval[], sha512_ctx ctx[1]);
-
- In addition there is a function sha2 that can be used to call all these
- functions using a call with a hash length parameter as follows:
-
-       int sha2_begin(unsigned long len, sha2_ctx ctx[1]);
-       void sha2_hash(const unsigned char data[],
-                            unsigned long len, sha2_ctx ctx[1]);
-       void sha2_end(unsigned char hval[], sha2_ctx ctx[1]);
-
- My thanks to Erik Andersen <andersen@codepoet.org> for testing this code
- on big-endian systems and for his assistance with corrections
-*/
-
-#include "Common/Endian.h"
-#include "Crypto/misc.h"
-#define PLATFORM_BYTE_ORDER BYTE_ORDER
-#define IS_LITTLE_ENDIAN LITTLE_ENDIAN
-
-#if 0
-#define UNROLL_SHA2     /* for SHA2 loop unroll     */
-#endif
-
-#include <string.h>     /* for memcpy() etc.        */
-
-#include "Sha2.h"
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-#if defined( _MSC_VER ) && ( _MSC_VER > 800 )
-#pragma intrinsic(memcpy)
-#endif
-
-#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-#define SWAP_BYTES
-#else
-#undef  SWAP_BYTES
-#endif
-
-#if 0
-
-#define ch(x,y,z)       (((x) & (y)) ^ (~(x) & (z)))
-#define maj(x,y,z)      (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
-
-#else   /* Thanks to Rich Schroeppel and Colin Plumb for the following      */
-
-#define ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
-#define maj(x,y,z)      (((x) & (y)) | ((z) & ((x) ^ (y))))
-
-#endif
-
-/* round transforms for SHA256 and SHA512 compression functions */
-
-#define vf(n,i) v[(n - i) & 7]
-
-#define hf(i) (p[i & 15] += \
-    g_1(p[(i + 14) & 15]) + p[(i + 9) & 15] + g_0(p[(i + 1) & 15]))
-
-#define v_cycle(i,j)                                \
-    vf(7,i) += (j ? hf(i) : p[i]) + k_0[i+j]        \
-    + s_1(vf(4,i)) + ch(vf(4,i),vf(5,i),vf(6,i));   \
-    vf(3,i) += vf(7,i);                             \
-    vf(7,i) += s_0(vf(0,i))+ maj(vf(0,i),vf(1,i),vf(2,i))
-
-#if defined(SHA_224) || defined(SHA_256)
-
-#define SHA256_MASK (SHA256_BLOCK_SIZE - 1)
-
-#if defined(SWAP_BYTES)
-#define bsw_32(p,n) \
-    { int _i = (n); while(_i--) ((uint_32t*)p)[_i] = bswap_32(((uint_32t*)p)[_i]); }
-#else
-#define bsw_32(p,n)
-#endif
-
-#define s_0(x)  (rotr32((x),  2) ^ rotr32((x), 13) ^ rotr32((x), 22))
-#define s_1(x)  (rotr32((x),  6) ^ rotr32((x), 11) ^ rotr32((x), 25))
-#define g_0(x)  (rotr32((x),  7) ^ rotr32((x), 18) ^ ((x) >>  3))
-#define g_1(x)  (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))
-#define k_0     k256
-
-/* rotated SHA256 round definition. Rather than swapping variables as in    */
-/* FIPS-180, different variables are 'rotated' on each round, returning     */
-/* to their starting positions every eight rounds                           */
-
-#define q(n)  v##n
-
-#define one_cycle(a,b,c,d,e,f,g,h,k,w)  \
-    q(h) += s_1(q(e)) + ch(q(e), q(f), q(g)) + k + w; \
-    q(d) += q(h); q(h) += s_0(q(a)) + maj(q(a), q(b), q(c))
-
-/* SHA256 mixing data   */
-
-const uint_32t k256[64] =
-{   0x428a2f98ul, 0x71374491ul, 0xb5c0fbcful, 0xe9b5dba5ul,
-    0x3956c25bul, 0x59f111f1ul, 0x923f82a4ul, 0xab1c5ed5ul,
-    0xd807aa98ul, 0x12835b01ul, 0x243185beul, 0x550c7dc3ul,
-    0x72be5d74ul, 0x80deb1feul, 0x9bdc06a7ul, 0xc19bf174ul,
-    0xe49b69c1ul, 0xefbe4786ul, 0x0fc19dc6ul, 0x240ca1ccul,
-    0x2de92c6ful, 0x4a7484aaul, 0x5cb0a9dcul, 0x76f988daul,
-    0x983e5152ul, 0xa831c66dul, 0xb00327c8ul, 0xbf597fc7ul,
-    0xc6e00bf3ul, 0xd5a79147ul, 0x06ca6351ul, 0x14292967ul,
-    0x27b70a85ul, 0x2e1b2138ul, 0x4d2c6dfcul, 0x53380d13ul,
-    0x650a7354ul, 0x766a0abbul, 0x81c2c92eul, 0x92722c85ul,
-    0xa2bfe8a1ul, 0xa81a664bul, 0xc24b8b70ul, 0xc76c51a3ul,
-    0xd192e819ul, 0xd6990624ul, 0xf40e3585ul, 0x106aa070ul,
-    0x19a4c116ul, 0x1e376c08ul, 0x2748774cul, 0x34b0bcb5ul,
-    0x391c0cb3ul, 0x4ed8aa4aul, 0x5b9cca4ful, 0x682e6ff3ul,
-    0x748f82eeul, 0x78a5636ful, 0x84c87814ul, 0x8cc70208ul,
-    0x90befffaul, 0xa4506cebul, 0xbef9a3f7ul, 0xc67178f2ul,
-};
-
-/* Compile 64 bytes of hash data into SHA256 digest value   */
-/* NOTE: this routine assumes that the byte order in the    */
-/* ctx->wbuf[] at this point is such that low address bytes */
-/* in the ORIGINAL byte stream will go into the high end of */
-/* words on BOTH big and little endian systems              */
-
-VOID_RETURN sha256_compile(sha256_ctx ctx[1])
-{
-#if !defined(UNROLL_SHA2)
-
-    uint_32t j, *p = ctx->wbuf, v[8];
-
-    memcpy(v, ctx->hash, 8 * sizeof(uint_32t));
-
-    for(j = 0; j < 64; j += 16)
-    {
-        v_cycle( 0, j); v_cycle( 1, j);
-        v_cycle( 2, j); v_cycle( 3, j);
-        v_cycle( 4, j); v_cycle( 5, j);
-        v_cycle( 6, j); v_cycle( 7, j);
-        v_cycle( 8, j); v_cycle( 9, j);
-        v_cycle(10, j); v_cycle(11, j);
-        v_cycle(12, j); v_cycle(13, j);
-        v_cycle(14, j); v_cycle(15, j);
-    }
-
-    ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
-    ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
-    ctx->hash[4] += v[4]; ctx->hash[5] += v[5];
-    ctx->hash[6] += v[6]; ctx->hash[7] += v[7];
-
-#else
-
-    uint_32t *p = ctx->wbuf,v0,v1,v2,v3,v4,v5,v6,v7;
-
-    v0 = ctx->hash[0]; v1 = ctx->hash[1];
-    v2 = ctx->hash[2]; v3 = ctx->hash[3];
-    v4 = ctx->hash[4]; v5 = ctx->hash[5];
-    v6 = ctx->hash[6]; v7 = ctx->hash[7];
-
-    one_cycle(0,1,2,3,4,5,6,7,k256[ 0],p[ 0]);
-    one_cycle(7,0,1,2,3,4,5,6,k256[ 1],p[ 1]);
-    one_cycle(6,7,0,1,2,3,4,5,k256[ 2],p[ 2]);
-    one_cycle(5,6,7,0,1,2,3,4,k256[ 3],p[ 3]);
-    one_cycle(4,5,6,7,0,1,2,3,k256[ 4],p[ 4]);
-    one_cycle(3,4,5,6,7,0,1,2,k256[ 5],p[ 5]);
-    one_cycle(2,3,4,5,6,7,0,1,k256[ 6],p[ 6]);
-    one_cycle(1,2,3,4,5,6,7,0,k256[ 7],p[ 7]);
-    one_cycle(0,1,2,3,4,5,6,7,k256[ 8],p[ 8]);
-    one_cycle(7,0,1,2,3,4,5,6,k256[ 9],p[ 9]);
-    one_cycle(6,7,0,1,2,3,4,5,k256[10],p[10]);
-    one_cycle(5,6,7,0,1,2,3,4,k256[11],p[11]);
-    one_cycle(4,5,6,7,0,1,2,3,k256[12],p[12]);
-    one_cycle(3,4,5,6,7,0,1,2,k256[13],p[13]);
-    one_cycle(2,3,4,5,6,7,0,1,k256[14],p[14]);
-    one_cycle(1,2,3,4,5,6,7,0,k256[15],p[15]);
-
-    one_cycle(0,1,2,3,4,5,6,7,k256[16],hf( 0));
-    one_cycle(7,0,1,2,3,4,5,6,k256[17],hf( 1));
-    one_cycle(6,7,0,1,2,3,4,5,k256[18],hf( 2));
-    one_cycle(5,6,7,0,1,2,3,4,k256[19],hf( 3));
-    one_cycle(4,5,6,7,0,1,2,3,k256[20],hf( 4));
-    one_cycle(3,4,5,6,7,0,1,2,k256[21],hf( 5));
-    one_cycle(2,3,4,5,6,7,0,1,k256[22],hf( 6));
-    one_cycle(1,2,3,4,5,6,7,0,k256[23],hf( 7));
-    one_cycle(0,1,2,3,4,5,6,7,k256[24],hf( 8));
-    one_cycle(7,0,1,2,3,4,5,6,k256[25],hf( 9));
-    one_cycle(6,7,0,1,2,3,4,5,k256[26],hf(10));
-    one_cycle(5,6,7,0,1,2,3,4,k256[27],hf(11));
-    one_cycle(4,5,6,7,0,1,2,3,k256[28],hf(12));
-    one_cycle(3,4,5,6,7,0,1,2,k256[29],hf(13));
-    one_cycle(2,3,4,5,6,7,0,1,k256[30],hf(14));
-    one_cycle(1,2,3,4,5,6,7,0,k256[31],hf(15));
-
-    one_cycle(0,1,2,3,4,5,6,7,k256[32],hf( 0));
-    one_cycle(7,0,1,2,3,4,5,6,k256[33],hf( 1));
-    one_cycle(6,7,0,1,2,3,4,5,k256[34],hf( 2));
-    one_cycle(5,6,7,0,1,2,3,4,k256[35],hf( 3));
-    one_cycle(4,5,6,7,0,1,2,3,k256[36],hf( 4));
-    one_cycle(3,4,5,6,7,0,1,2,k256[37],hf( 5));
-    one_cycle(2,3,4,5,6,7,0,1,k256[38],hf( 6));
-    one_cycle(1,2,3,4,5,6,7,0,k256[39],hf( 7));
-    one_cycle(0,1,2,3,4,5,6,7,k256[40],hf( 8));
-    one_cycle(7,0,1,2,3,4,5,6,k256[41],hf( 9));
-    one_cycle(6,7,0,1,2,3,4,5,k256[42],hf(10));
-    one_cycle(5,6,7,0,1,2,3,4,k256[43],hf(11));
-    one_cycle(4,5,6,7,0,1,2,3,k256[44],hf(12));
-    one_cycle(3,4,5,6,7,0,1,2,k256[45],hf(13));
-    one_cycle(2,3,4,5,6,7,0,1,k256[46],hf(14));
-    one_cycle(1,2,3,4,5,6,7,0,k256[47],hf(15));
-
-    one_cycle(0,1,2,3,4,5,6,7,k256[48],hf( 0));
-    one_cycle(7,0,1,2,3,4,5,6,k256[49],hf( 1));
-    one_cycle(6,7,0,1,2,3,4,5,k256[50],hf( 2));
-    one_cycle(5,6,7,0,1,2,3,4,k256[51],hf( 3));
-    one_cycle(4,5,6,7,0,1,2,3,k256[52],hf( 4));
-    one_cycle(3,4,5,6,7,0,1,2,k256[53],hf( 5));
-    one_cycle(2,3,4,5,6,7,0,1,k256[54],hf( 6));
-    one_cycle(1,2,3,4,5,6,7,0,k256[55],hf( 7));
-    one_cycle(0,1,2,3,4,5,6,7,k256[56],hf( 8));
-    one_cycle(7,0,1,2,3,4,5,6,k256[57],hf( 9));
-    one_cycle(6,7,0,1,2,3,4,5,k256[58],hf(10));
-    one_cycle(5,6,7,0,1,2,3,4,k256[59],hf(11));
-    one_cycle(4,5,6,7,0,1,2,3,k256[60],hf(12));
-    one_cycle(3,4,5,6,7,0,1,2,k256[61],hf(13));
-    one_cycle(2,3,4,5,6,7,0,1,k256[62],hf(14));
-    one_cycle(1,2,3,4,5,6,7,0,k256[63],hf(15));
-
-    ctx->hash[0] += v0; ctx->hash[1] += v1;
-    ctx->hash[2] += v2; ctx->hash[3] += v3;
-    ctx->hash[4] += v4; ctx->hash[5] += v5;
-    ctx->hash[6] += v6; ctx->hash[7] += v7;
-#endif
-}
-
-/* SHA256 hash data in an array of bytes into hash buffer   */
-/* and call the hash_compile function as required.          */
-
-VOID_RETURN sha256_hash(const unsigned char data[], unsigned long len, sha256_ctx ctx[1])
-{   uint_32t pos = (uint_32t)(ctx->count[0] & SHA256_MASK),
-             space = SHA256_BLOCK_SIZE - pos;
-    const unsigned char *sp = data;
-
-    if((ctx->count[0] += len) < len)
-        ++(ctx->count[1]);
-
-    while(len >= space)     /* tranfer whole blocks while possible  */
-    {
-        memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space);
-        sp += space; len -= space; space = SHA256_BLOCK_SIZE; pos = 0;
-        bsw_32(ctx->wbuf, SHA256_BLOCK_SIZE >> 2)
-        sha256_compile(ctx);
-    }
-
-    memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len);
-}
-
-/* SHA256 Final padding and digest calculation  */
-
-static void sha_end1(unsigned char hval[], sha256_ctx ctx[1], const unsigned int hlen)
-{   uint_32t    i = (uint_32t)(ctx->count[0] & SHA256_MASK);
-
-    /* put bytes in the buffer in an order in which references to   */
-    /* 32-bit words will put bytes with lower addresses into the    */
-    /* top of 32 bit words on BOTH big and little endian machines   */
-    bsw_32(ctx->wbuf, (i + 3) >> 2)
-
-    /* we now need to mask valid bytes and add the padding which is */
-    /* a single 1 bit and as many zero bits as necessary. Note that */
-    /* we can always add the first padding byte here because the    */
-    /* buffer always has at least one empty slot                    */
-    ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3);
-    ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3);
-
-    /* we need 9 or more empty positions, one for the padding byte  */
-    /* (above) and eight for the length count.  If there is not     */
-    /* enough space pad and empty the buffer                        */
-    if(i > SHA256_BLOCK_SIZE - 9)
-    {
-        if(i < 60) ctx->wbuf[15] = 0;
-        sha256_compile(ctx);
-        i = 0;
-    }
-    else    /* compute a word index for the empty buffer positions  */
-        i = (i >> 2) + 1;
-
-    while(i < 14) /* and zero pad all but last two positions        */
-        ctx->wbuf[i++] = 0;
-
-    /* the following 32-bit length fields are assembled in the      */
-    /* wrong byte order on little endian machines but this is       */
-    /* corrected later since they are only ever used as 32-bit      */
-    /* word values.                                                 */
-    ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29);
-    ctx->wbuf[15] = ctx->count[0] << 3;
-    sha256_compile(ctx);
-
-    /* extract the hash value as bytes in case the hash buffer is   */
-    /* mislaigned for 32-bit words                                  */
-    for(i = 0; i < hlen; ++i)
-        hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3)));
-}
-
-#endif
-
-#if defined(SHA_224)
-
-const uint_32t i224[8] =
-{
-    0xc1059ed8ul, 0x367cd507ul, 0x3070dd17ul, 0xf70e5939ul,
-    0xffc00b31ul, 0x68581511ul, 0x64f98fa7ul, 0xbefa4fa4ul
-};
-
-VOID_RETURN sha224_begin(sha224_ctx ctx[1])
-{
-    ctx->count[0] = ctx->count[1] = 0;
-    memcpy(ctx->hash, i224, 8 * sizeof(uint_32t));
-}
-
-VOID_RETURN sha224_end(unsigned char hval[], sha224_ctx ctx[1])
-{
-    sha_end1(hval, ctx, SHA224_DIGEST_SIZE);
-}
-
-VOID_RETURN sha224(unsigned char hval[], const unsigned char data[], unsigned long len)
-{   sha224_ctx  cx[1];
-
-    sha224_begin(cx);
-    sha224_hash(data, len, cx);
-    sha_end1(hval, cx, SHA224_DIGEST_SIZE);
-}
-
-#endif
-
-#if defined(SHA_256)
-
-const uint_32t i256[8] =
-{
-    0x6a09e667ul, 0xbb67ae85ul, 0x3c6ef372ul, 0xa54ff53aul,
-    0x510e527ful, 0x9b05688cul, 0x1f83d9abul, 0x5be0cd19ul
-};
-
-VOID_RETURN sha256_begin(sha256_ctx ctx[1])
-{
-    ctx->count[0] = ctx->count[1] = 0;
-    memcpy(ctx->hash, i256, 8 * sizeof(uint_32t));
-}
-
-VOID_RETURN sha256_end(unsigned char hval[], sha256_ctx ctx[1])
-{
-    sha_end1(hval, ctx, SHA256_DIGEST_SIZE);
-}
-
-VOID_RETURN sha256(unsigned char hval[], const unsigned char data[], unsigned long len)
-{   sha256_ctx  cx[1];
-
-    sha256_begin(cx);
-    sha256_hash(data, len, cx);
-    sha_end1(hval, cx, SHA256_DIGEST_SIZE);
-}
-
-#endif
-
-#if defined(SHA_384) || defined(SHA_512)
-
-#define SHA512_MASK (SHA512_BLOCK_SIZE - 1)
-
-#if defined(SWAP_BYTES)
-#define bsw_64(p,n) \
-    { int _i = (n); while(_i--) ((uint_64t*)p)[_i] = bswap_64(((uint_64t*)p)[_i]); }
-#else
-#define bsw_64(p,n)
-#endif
-
-/* SHA512 mixing function definitions   */
-
-#ifdef   s_0
-# undef  s_0
-# undef  s_1
-# undef  g_0
-# undef  g_1
-# undef  k_0
-#endif
-
-#define s_0(x)  (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39))
-#define s_1(x)  (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41))
-#define g_0(x)  (rotr64((x),  1) ^ rotr64((x),  8) ^ ((x) >>  7))
-#define g_1(x)  (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >>  6))
-#define k_0     k512
-
-/* SHA384/SHA512 mixing data    */
-
-const uint_64t  k512[80] =
-{
-    li_64(428a2f98d728ae22), li_64(7137449123ef65cd),
-    li_64(b5c0fbcfec4d3b2f), li_64(e9b5dba58189dbbc),
-    li_64(3956c25bf348b538), li_64(59f111f1b605d019),
-    li_64(923f82a4af194f9b), li_64(ab1c5ed5da6d8118),
-    li_64(d807aa98a3030242), li_64(12835b0145706fbe),
-    li_64(243185be4ee4b28c), li_64(550c7dc3d5ffb4e2),
-    li_64(72be5d74f27b896f), li_64(80deb1fe3b1696b1),
-    li_64(9bdc06a725c71235), li_64(c19bf174cf692694),
-    li_64(e49b69c19ef14ad2), li_64(efbe4786384f25e3),
-    li_64(0fc19dc68b8cd5b5), li_64(240ca1cc77ac9c65),
-    li_64(2de92c6f592b0275), li_64(4a7484aa6ea6e483),
-    li_64(5cb0a9dcbd41fbd4), li_64(76f988da831153b5),
-    li_64(983e5152ee66dfab), li_64(a831c66d2db43210),
-    li_64(b00327c898fb213f), li_64(bf597fc7beef0ee4),
-    li_64(c6e00bf33da88fc2), li_64(d5a79147930aa725),
-    li_64(06ca6351e003826f), li_64(142929670a0e6e70),
-    li_64(27b70a8546d22ffc), li_64(2e1b21385c26c926),
-    li_64(4d2c6dfc5ac42aed), li_64(53380d139d95b3df),
-    li_64(650a73548baf63de), li_64(766a0abb3c77b2a8),
-    li_64(81c2c92e47edaee6), li_64(92722c851482353b),
-    li_64(a2bfe8a14cf10364), li_64(a81a664bbc423001),
-    li_64(c24b8b70d0f89791), li_64(c76c51a30654be30),
-    li_64(d192e819d6ef5218), li_64(d69906245565a910),
-    li_64(f40e35855771202a), li_64(106aa07032bbd1b8),
-    li_64(19a4c116b8d2d0c8), li_64(1e376c085141ab53),
-    li_64(2748774cdf8eeb99), li_64(34b0bcb5e19b48a8),
-    li_64(391c0cb3c5c95a63), li_64(4ed8aa4ae3418acb),
-    li_64(5b9cca4f7763e373), li_64(682e6ff3d6b2b8a3),
-    li_64(748f82ee5defb2fc), li_64(78a5636f43172f60),
-    li_64(84c87814a1f0ab72), li_64(8cc702081a6439ec),
-    li_64(90befffa23631e28), li_64(a4506cebde82bde9),
-    li_64(bef9a3f7b2c67915), li_64(c67178f2e372532b),
-    li_64(ca273eceea26619c), li_64(d186b8c721c0c207),
-    li_64(eada7dd6cde0eb1e), li_64(f57d4f7fee6ed178),
-    li_64(06f067aa72176fba), li_64(0a637dc5a2c898a6),
-    li_64(113f9804bef90dae), li_64(1b710b35131c471b),
-    li_64(28db77f523047d84), li_64(32caab7b40c72493),
-    li_64(3c9ebe0a15c9bebc), li_64(431d67c49c100d4c),
-    li_64(4cc5d4becb3e42b6), li_64(597f299cfc657e2a),
-    li_64(5fcb6fab3ad6faec), li_64(6c44198c4a475817)
-};
-
-/* Compile 128 bytes of hash data into SHA384/512 digest    */
-/* NOTE: this routine assumes that the byte order in the    */
-/* ctx->wbuf[] at this point is such that low address bytes */
-/* in the ORIGINAL byte stream will go into the high end of */
-/* words on BOTH big and little endian systems              */
-
-VOID_RETURN sha512_compile(sha512_ctx ctx[1])
-{   uint_64t    v[8], *p = ctx->wbuf;
-    uint_32t    j;
-
-    memcpy(v, ctx->hash, 8 * sizeof(uint_64t));
-
-    for(j = 0; j < 80; j += 16)
-    {
-        v_cycle( 0, j); v_cycle( 1, j);
-        v_cycle( 2, j); v_cycle( 3, j);
-        v_cycle( 4, j); v_cycle( 5, j);
-        v_cycle( 6, j); v_cycle( 7, j);
-        v_cycle( 8, j); v_cycle( 9, j);
-        v_cycle(10, j); v_cycle(11, j);
-        v_cycle(12, j); v_cycle(13, j);
-        v_cycle(14, j); v_cycle(15, j);
-    }
-
-    ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
-    ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
-    ctx->hash[4] += v[4]; ctx->hash[5] += v[5];
-    ctx->hash[6] += v[6]; ctx->hash[7] += v[7];
-}
-
-/* Compile 128 bytes of hash data into SHA256 digest value  */
-/* NOTE: this routine assumes that the byte order in the    */
-/* ctx->wbuf[] at this point is in such an order that low   */
-/* address bytes in the ORIGINAL byte stream placed in this */
-/* buffer will now go to the high end of words on BOTH big  */
-/* and little endian systems                                */
-
-VOID_RETURN sha512_hash(const unsigned char data[], unsigned long len, sha512_ctx ctx[1])
-{   uint_32t pos = (uint_32t)(ctx->count[0] & SHA512_MASK),
-             space = SHA512_BLOCK_SIZE - pos;
-    const unsigned char *sp = data;
-
-    if((ctx->count[0] += len) < len)
-        ++(ctx->count[1]);
-
-    while(len >= space)     /* tranfer whole blocks while possible  */
-    {
-        memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space);
-        sp += space; len -= space; space = SHA512_BLOCK_SIZE; pos = 0;
-        bsw_64(ctx->wbuf, SHA512_BLOCK_SIZE >> 3);
-        sha512_compile(ctx);
-    }
-
-    memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len);
-}
-
-/* SHA384/512 Final padding and digest calculation  */
-
-static void sha_end2(unsigned char hval[], sha512_ctx ctx[1], const unsigned int hlen)
-{   uint_32t    i = (uint_32t)(ctx->count[0] & SHA512_MASK);
-
-    /* put bytes in the buffer in an order in which references to   */
-    /* 32-bit words will put bytes with lower addresses into the    */
-    /* top of 32 bit words on BOTH big and little endian machines   */
-    bsw_64(ctx->wbuf, (i + 7) >> 3);
-
-    /* we now need to mask valid bytes and add the padding which is */
-    /* a single 1 bit and as many zero bits as necessary. Note that */
-    /* we can always add the first padding byte here because the    */
-    /* buffer always has at least one empty slot                    */
-    ctx->wbuf[i >> 3] &= li_64(ffffffffffffff00) << 8 * (~i & 7);
-    ctx->wbuf[i >> 3] |= li_64(0000000000000080) << 8 * (~i & 7);
-
-    /* we need 17 or more empty byte positions, one for the padding */
-    /* byte (above) and sixteen for the length count.  If there is  */
-    /* not enough space pad and empty the buffer                    */
-    if(i > SHA512_BLOCK_SIZE - 17)
-    {
-        if(i < 120) ctx->wbuf[15] = 0;
-        sha512_compile(ctx);
-        i = 0;
-    }
-    else
-        i = (i >> 3) + 1;
-
-    while(i < 14)
-        ctx->wbuf[i++] = 0;
-
-    /* the following 64-bit length fields are assembled in the      */
-    /* wrong byte order on little endian machines but this is       */
-    /* corrected later since they are only ever used as 64-bit      */
-    /* word values.                                                 */
-    ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 61);
-    ctx->wbuf[15] = ctx->count[0] << 3;
-    sha512_compile(ctx);
-
-    /* extract the hash value as bytes in case the hash buffer is   */
-    /* misaligned for 32-bit words                                  */
-    for(i = 0; i < hlen; ++i)
-        hval[i] = (unsigned char)(ctx->hash[i >> 3] >> (8 * (~i & 7)));
-}
-
-#endif
-
-#if defined(SHA_384)
-
-/* SHA384 initialisation data   */
-
-const uint_64t  i384[80] =
-{
-    li_64(cbbb9d5dc1059ed8), li_64(629a292a367cd507),
-    li_64(9159015a3070dd17), li_64(152fecd8f70e5939),
-    li_64(67332667ffc00b31), li_64(8eb44a8768581511),
-    li_64(db0c2e0d64f98fa7), li_64(47b5481dbefa4fa4)
-};
-
-VOID_RETURN sha384_begin(sha384_ctx ctx[1])
-{
-    ctx->count[0] = ctx->count[1] = 0;
-    memcpy(ctx->hash, i384, 8 * sizeof(uint_64t));
-}
-
-VOID_RETURN sha384_end(unsigned char hval[], sha384_ctx ctx[1])
-{
-    sha_end2(hval, ctx, SHA384_DIGEST_SIZE);
-}
-
-VOID_RETURN sha384(unsigned char hval[], const unsigned char data[], unsigned long len)
-{   sha384_ctx  cx[1];
-
-    sha384_begin(cx);
-    sha384_hash(data, len, cx);
-    sha_end2(hval, cx, SHA384_DIGEST_SIZE);
-}
-
-#endif
-
-#if defined(SHA_512)
-
-/* SHA512 initialisation data   */
-
-const uint_64t  i512[80] =
-{
-    li_64(6a09e667f3bcc908), li_64(bb67ae8584caa73b),
-    li_64(3c6ef372fe94f82b), li_64(a54ff53a5f1d36f1),
-    li_64(510e527fade682d1), li_64(9b05688c2b3e6c1f),
-    li_64(1f83d9abfb41bd6b), li_64(5be0cd19137e2179)
-};
-
-VOID_RETURN sha512_begin(sha512_ctx ctx[1])
-{
-    ctx->count[0] = ctx->count[1] = 0;
-    memcpy(ctx->hash, i512, 8 * sizeof(uint_64t));
-}
-
-VOID_RETURN sha512_end(unsigned char hval[], sha512_ctx ctx[1])
-{
-    sha_end2(hval, ctx, SHA512_DIGEST_SIZE);
-}
-
-VOID_RETURN sha512(unsigned char hval[], const unsigned char data[], unsigned long len)
-{   sha512_ctx  cx[1];
-
-    sha512_begin(cx);
-    sha512_hash(data, len, cx);
-    sha_end2(hval, cx, SHA512_DIGEST_SIZE);
-}
-
-#endif
-
-#if defined(SHA_2)
-
-#define CTX_224(x)  ((x)->uu->ctx256)
-#define CTX_256(x)  ((x)->uu->ctx256)
-#define CTX_384(x)  ((x)->uu->ctx512)
-#define CTX_512(x)  ((x)->uu->ctx512)
-
-/* SHA2 initialisation */
-
-INT_RETURN sha2_begin(unsigned long len, sha2_ctx ctx[1])
-{
-    switch(len)
-    {
-#if defined(SHA_224)
-        case 224:
-        case  28:   CTX_256(ctx)->count[0] = CTX_256(ctx)->count[1] = 0;
-                    memcpy(CTX_256(ctx)->hash, i224, 32);
-                    ctx->sha2_len = 28; return EXIT_SUCCESS;
-#endif
-#if defined(SHA_256)
-        case 256:
-        case  32:   CTX_256(ctx)->count[0] = CTX_256(ctx)->count[1] = 0;
-                    memcpy(CTX_256(ctx)->hash, i256, 32);
-                    ctx->sha2_len = 32; return EXIT_SUCCESS;
-#endif
-#if defined(SHA_384)
-        case 384:
-        case  48:   CTX_384(ctx)->count[0] = CTX_384(ctx)->count[1] = 0;
-                    memcpy(CTX_384(ctx)->hash, i384, 64);
-                    ctx->sha2_len = 48; return EXIT_SUCCESS;
-#endif
-#if defined(SHA_512)
-        case 512:
-        case  64:   CTX_512(ctx)->count[0] = CTX_512(ctx)->count[1] = 0;
-                    memcpy(CTX_512(ctx)->hash, i512, 64);
-                    ctx->sha2_len = 64; return EXIT_SUCCESS;
-#endif
-        default:    return EXIT_FAILURE;
-    }
-}
-
-VOID_RETURN sha2_hash(const unsigned char data[], unsigned long len, sha2_ctx ctx[1])
-{
-    switch(ctx->sha2_len)
-    {
-#if defined(SHA_224)
-        case 28: sha224_hash(data, len, CTX_224(ctx)); return;
-#endif
-#if defined(SHA_256)
-        case 32: sha256_hash(data, len, CTX_256(ctx)); return;
-#endif
-#if defined(SHA_384)
-        case 48: sha384_hash(data, len, CTX_384(ctx)); return;
-#endif
-#if defined(SHA_512)
-        case 64: sha512_hash(data, len, CTX_512(ctx)); return;
-#endif
-    }
-}
-
-VOID_RETURN sha2_end(unsigned char hval[], sha2_ctx ctx[1])
-{
-    switch(ctx->sha2_len)
-    {
-#if defined(SHA_224)
-        case 28: sha_end1(hval, CTX_224(ctx), SHA224_DIGEST_SIZE); return;
-#endif
-#if defined(SHA_256)
-        case 32: sha_end1(hval, CTX_256(ctx), SHA256_DIGEST_SIZE); return;
-#endif
-#if defined(SHA_384)
-        case 48: sha_end2(hval, CTX_384(ctx), SHA384_DIGEST_SIZE); return;
-#endif
-#if defined(SHA_512)
-        case 64: sha_end2(hval, CTX_512(ctx), SHA512_DIGEST_SIZE); return;
-#endif
-    }
-}
-
-INT_RETURN sha2(unsigned char hval[], unsigned long size,
-                                const unsigned char data[], unsigned long len)
-{   sha2_ctx    cx[1];
-
-    if(sha2_begin(size, cx) == EXIT_SUCCESS)
-    {
-        sha2_hash(data, len, cx); sha2_end(hval, cx); return EXIT_SUCCESS;
-    }
-    else
-        return EXIT_FAILURE;
-}
-
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+
+ This is a byte oriented version of SHA2 that operates on arrays of bytes
+ stored in memory. This code implements sha256, sha384 and sha512 but the
+ latter two functions rely on efficient 64-bit integer operations that
+ may not be very efficient on 32-bit machines
+
+ The sha256 functions use a type 'sha256_ctx' to hold details of the
+ current hash state and uses the following three calls:
+
+       void sha256_begin(sha256_ctx ctx[1])
+       void sha256_hash(const unsigned char data[],
+                            unsigned long len, sha256_ctx ctx[1])
+       void sha256_end(unsigned char hval[], sha256_ctx ctx[1])
+
+ The first subroutine initialises a hash computation by setting up the
+ context in the sha256_ctx context. The second subroutine hashes 8-bit
+ bytes from array data[] into the hash state within the sha256_ctx context,
+ the number of bytes to be hashed being given by the unsigned long
+ integer len.  The third subroutine completes the hash calculation and
+ places the resulting digest value in the array of 8-bit bytes hval[].
+
+ The sha384 and sha512 functions are similar and use the interfaces:
+
+       void sha384_begin(sha384_ctx ctx[1]);
+       void sha384_hash(const unsigned char data[],
+                            unsigned long len, sha384_ctx ctx[1]);
+       void sha384_end(unsigned char hval[], sha384_ctx ctx[1]);
+
+       void sha512_begin(sha512_ctx ctx[1]);
+       void sha512_hash(const unsigned char data[],
+                            unsigned long len, sha512_ctx ctx[1]);
+       void sha512_end(unsigned char hval[], sha512_ctx ctx[1]);
+
+ In addition there is a function sha2 that can be used to call all these
+ functions using a call with a hash length parameter as follows:
+
+       int sha2_begin(unsigned long len, sha2_ctx ctx[1]);
+       void sha2_hash(const unsigned char data[],
+                            unsigned long len, sha2_ctx ctx[1]);
+       void sha2_end(unsigned char hval[], sha2_ctx ctx[1]);
+
+ My thanks to Erik Andersen <andersen@codepoet.org> for testing this code
+ on big-endian systems and for his assistance with corrections
+*/
+
+#include "Common/Endian.h"
+#include "Crypto/misc.h"
+#define PLATFORM_BYTE_ORDER BYTE_ORDER
+#define IS_LITTLE_ENDIAN LITTLE_ENDIAN
+
+#if 0
+#define UNROLL_SHA2     /* for SHA2 loop unroll     */
+#endif
+
+#include <string.h>     /* for memcpy() etc.        */
+
+#include "Sha2.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#if defined( _MSC_VER ) && ( _MSC_VER > 800 )
+#pragma intrinsic(memcpy)
+#endif
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define SWAP_BYTES      /* little endian: bsw_32/bsw_64 must swap words     */
+#else
+#undef  SWAP_BYTES      /* otherwise bsw_32/bsw_64 expand to nothing        */
+#endif
+
+#if 0   /* disabled: alternative expansions of ch() and maj()               */
+
+#define ch(x,y,z)       (((x) & (y)) ^ (~(x) & (z)))
+#define maj(x,y,z)      (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+#else   /* Thanks to Rich Schroeppel and Colin Plumb for the following      */
+
+#define ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
+#define maj(x,y,z)      (((x) & (y)) | ((z) & ((x) ^ (y))))
+
+#endif
+
+/* Round macros shared by the SHA256 and SHA512 compression functions. They */
+/* expect locals v[8] (working state) and p[16] (message words) in scope.   */
+#define vf(n,i) v[(n - i) & 7]
+/* hf(i): update message-schedule word i in place in p[] and yield it       */
+#define hf(i) (p[i & 15] += \
+    g_1(p[(i + 14) & 15]) + p[(i + 9) & 15] + g_0(p[(i + 1) & 15]))
+/* v_cycle(i,j): round i+j; when j is 0 the word p[i] is used unmodified    */
+#define v_cycle(i,j)                                \
+    vf(7,i) += (j ? hf(i) : p[i]) + k_0[i+j]        \
+    + s_1(vf(4,i)) + ch(vf(4,i),vf(5,i),vf(6,i));   \
+    vf(3,i) += vf(7,i);                             \
+    vf(7,i) += s_0(vf(0,i))+ maj(vf(0,i),vf(1,i),vf(2,i))
+
+#if defined(SHA_224) || defined(SHA_256)
+
+#define SHA256_MASK (SHA256_BLOCK_SIZE - 1)     /* mask: byte offset in block */
+/* bsw_32(p,n): byte-swap n 32-bit words at p; a full { } block, no ';' needed */
+#if defined(SWAP_BYTES)
+#define bsw_32(p,n) \
+    { int _i = (n); while(_i--) ((uint_32t*)p)[_i] = bswap_32(((uint_32t*)p)[_i]); }
+#else
+#define bsw_32(p,n)
+#endif
+/* SHA256 mixing functions and constant table picked up by the round macros */
+#define s_0(x)  (rotr32((x),  2) ^ rotr32((x), 13) ^ rotr32((x), 22))
+#define s_1(x)  (rotr32((x),  6) ^ rotr32((x), 11) ^ rotr32((x), 25))
+#define g_0(x)  (rotr32((x),  7) ^ rotr32((x), 18) ^ ((x) >>  3))
+#define g_1(x)  (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))
+#define k_0     k256
+
+/* rotated SHA256 round definition. Rather than swapping variables as in    */
+/* FIPS-180, different variables are 'rotated' on each round, returning     */
+/* to their starting positions every eight rounds                           */
+/* q(n) names the scalar v<n> (v0..v7) used by the unrolled compression     */
+#define q(n)  v##n
+/* one_cycle: one round; a..h pick the v<n> scalars, k and w are added in   */
+#define one_cycle(a,b,c,d,e,f,g,h,k,w)  \
+    q(h) += s_1(q(e)) + ch(q(e), q(f), q(g)) + k + w; \
+    q(d) += q(h); q(h) += s_0(q(a)) + maj(q(a), q(b), q(c))
+
+/* SHA256 mixing data: 64 constants, one added in by each compression round */
+
+const uint_32t k256[64] =
+{   0x428a2f98ul, 0x71374491ul, 0xb5c0fbcful, 0xe9b5dba5ul,
+    0x3956c25bul, 0x59f111f1ul, 0x923f82a4ul, 0xab1c5ed5ul,
+    0xd807aa98ul, 0x12835b01ul, 0x243185beul, 0x550c7dc3ul,
+    0x72be5d74ul, 0x80deb1feul, 0x9bdc06a7ul, 0xc19bf174ul,
+    0xe49b69c1ul, 0xefbe4786ul, 0x0fc19dc6ul, 0x240ca1ccul,
+    0x2de92c6ful, 0x4a7484aaul, 0x5cb0a9dcul, 0x76f988daul,
+    0x983e5152ul, 0xa831c66dul, 0xb00327c8ul, 0xbf597fc7ul,
+    0xc6e00bf3ul, 0xd5a79147ul, 0x06ca6351ul, 0x14292967ul,
+    0x27b70a85ul, 0x2e1b2138ul, 0x4d2c6dfcul, 0x53380d13ul,
+    0x650a7354ul, 0x766a0abbul, 0x81c2c92eul, 0x92722c85ul,
+    0xa2bfe8a1ul, 0xa81a664bul, 0xc24b8b70ul, 0xc76c51a3ul,
+    0xd192e819ul, 0xd6990624ul, 0xf40e3585ul, 0x106aa070ul,
+    0x19a4c116ul, 0x1e376c08ul, 0x2748774cul, 0x34b0bcb5ul,
+    0x391c0cb3ul, 0x4ed8aa4aul, 0x5b9cca4ful, 0x682e6ff3ul,
+    0x748f82eeul, 0x78a5636ful, 0x84c87814ul, 0x8cc70208ul,
+    0x90befffaul, 0xa4506cebul, 0xbef9a3f7ul, 0xc67178f2ul,
+};
+
+/* Compress the 64-byte block held in ctx->wbuf[] into the  */
+/* running SHA256 state ctx->hash[]; wbuf[] is overwritten  */
+/* by the message schedule.  NOTE: on entry wbuf[] must be  */
+/* ordered so that low address bytes of the ORIGINAL stream */
+/* sit in the high end of words on big AND little endian    */
+
+VOID_RETURN sha256_compile(sha256_ctx ctx[1])
+{
+#if !defined(UNROLL_SHA2)   /* compact form: four passes of 16 rounds       */
+
+    uint_32t j, *p = ctx->wbuf, v[8];
+    /* work on a copy so the previous state can be added back in afterwards */
+    memcpy(v, ctx->hash, 8 * sizeof(uint_32t));
+
+    for(j = 0; j < 64; j += 16)
+    {
+        v_cycle( 0, j); v_cycle( 1, j);
+        v_cycle( 2, j); v_cycle( 3, j);
+        v_cycle( 4, j); v_cycle( 5, j);
+        v_cycle( 6, j); v_cycle( 7, j);
+        v_cycle( 8, j); v_cycle( 9, j);
+        v_cycle(10, j); v_cycle(11, j);
+        v_cycle(12, j); v_cycle(13, j);
+        v_cycle(14, j); v_cycle(15, j);
+    }
+    /* fold the result of the 64 rounds into the running hash state         */
+    ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
+    ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
+    ctx->hash[4] += v[4]; ctx->hash[5] += v[5];
+    ctx->hash[6] += v[6]; ctx->hash[7] += v[7];
+
+#else   /* unrolled form: 64 explicit rounds on the scalars v0..v7          */
+
+    uint_32t *p = ctx->wbuf,v0,v1,v2,v3,v4,v5,v6,v7;
+
+    v0 = ctx->hash[0]; v1 = ctx->hash[1];
+    v2 = ctx->hash[2]; v3 = ctx->hash[3];
+    v4 = ctx->hash[4]; v5 = ctx->hash[5];
+    v6 = ctx->hash[6]; v7 = ctx->hash[7];
+    /* rounds 0-15 take the message words straight from the buffer          */
+    one_cycle(0,1,2,3,4,5,6,7,k256[ 0],p[ 0]);
+    one_cycle(7,0,1,2,3,4,5,6,k256[ 1],p[ 1]);
+    one_cycle(6,7,0,1,2,3,4,5,k256[ 2],p[ 2]);
+    one_cycle(5,6,7,0,1,2,3,4,k256[ 3],p[ 3]);
+    one_cycle(4,5,6,7,0,1,2,3,k256[ 4],p[ 4]);
+    one_cycle(3,4,5,6,7,0,1,2,k256[ 5],p[ 5]);
+    one_cycle(2,3,4,5,6,7,0,1,k256[ 6],p[ 6]);
+    one_cycle(1,2,3,4,5,6,7,0,k256[ 7],p[ 7]);
+    one_cycle(0,1,2,3,4,5,6,7,k256[ 8],p[ 8]);
+    one_cycle(7,0,1,2,3,4,5,6,k256[ 9],p[ 9]);
+    one_cycle(6,7,0,1,2,3,4,5,k256[10],p[10]);
+    one_cycle(5,6,7,0,1,2,3,4,k256[11],p[11]);
+    one_cycle(4,5,6,7,0,1,2,3,k256[12],p[12]);
+    one_cycle(3,4,5,6,7,0,1,2,k256[13],p[13]);
+    one_cycle(2,3,4,5,6,7,0,1,k256[14],p[14]);
+    one_cycle(1,2,3,4,5,6,7,0,k256[15],p[15]);
+    /* rounds 16-63 extend the message schedule in place through hf()       */
+    one_cycle(0,1,2,3,4,5,6,7,k256[16],hf( 0));
+    one_cycle(7,0,1,2,3,4,5,6,k256[17],hf( 1));
+    one_cycle(6,7,0,1,2,3,4,5,k256[18],hf( 2));
+    one_cycle(5,6,7,0,1,2,3,4,k256[19],hf( 3));
+    one_cycle(4,5,6,7,0,1,2,3,k256[20],hf( 4));
+    one_cycle(3,4,5,6,7,0,1,2,k256[21],hf( 5));
+    one_cycle(2,3,4,5,6,7,0,1,k256[22],hf( 6));
+    one_cycle(1,2,3,4,5,6,7,0,k256[23],hf( 7));
+    one_cycle(0,1,2,3,4,5,6,7,k256[24],hf( 8));
+    one_cycle(7,0,1,2,3,4,5,6,k256[25],hf( 9));
+    one_cycle(6,7,0,1,2,3,4,5,k256[26],hf(10));
+    one_cycle(5,6,7,0,1,2,3,4,k256[27],hf(11));
+    one_cycle(4,5,6,7,0,1,2,3,k256[28],hf(12));
+    one_cycle(3,4,5,6,7,0,1,2,k256[29],hf(13));
+    one_cycle(2,3,4,5,6,7,0,1,k256[30],hf(14));
+    one_cycle(1,2,3,4,5,6,7,0,k256[31],hf(15));
+
+    one_cycle(0,1,2,3,4,5,6,7,k256[32],hf( 0));
+    one_cycle(7,0,1,2,3,4,5,6,k256[33],hf( 1));
+    one_cycle(6,7,0,1,2,3,4,5,k256[34],hf( 2));
+    one_cycle(5,6,7,0,1,2,3,4,k256[35],hf( 3));
+    one_cycle(4,5,6,7,0,1,2,3,k256[36],hf( 4));
+    one_cycle(3,4,5,6,7,0,1,2,k256[37],hf( 5));
+    one_cycle(2,3,4,5,6,7,0,1,k256[38],hf( 6));
+    one_cycle(1,2,3,4,5,6,7,0,k256[39],hf( 7));
+    one_cycle(0,1,2,3,4,5,6,7,k256[40],hf( 8));
+    one_cycle(7,0,1,2,3,4,5,6,k256[41],hf( 9));
+    one_cycle(6,7,0,1,2,3,4,5,k256[42],hf(10));
+    one_cycle(5,6,7,0,1,2,3,4,k256[43],hf(11));
+    one_cycle(4,5,6,7,0,1,2,3,k256[44],hf(12));
+    one_cycle(3,4,5,6,7,0,1,2,k256[45],hf(13));
+    one_cycle(2,3,4,5,6,7,0,1,k256[46],hf(14));
+    one_cycle(1,2,3,4,5,6,7,0,k256[47],hf(15));
+
+    one_cycle(0,1,2,3,4,5,6,7,k256[48],hf( 0));
+    one_cycle(7,0,1,2,3,4,5,6,k256[49],hf( 1));
+    one_cycle(6,7,0,1,2,3,4,5,k256[50],hf( 2));
+    one_cycle(5,6,7,0,1,2,3,4,k256[51],hf( 3));
+    one_cycle(4,5,6,7,0,1,2,3,k256[52],hf( 4));
+    one_cycle(3,4,5,6,7,0,1,2,k256[53],hf( 5));
+    one_cycle(2,3,4,5,6,7,0,1,k256[54],hf( 6));
+    one_cycle(1,2,3,4,5,6,7,0,k256[55],hf( 7));
+    one_cycle(0,1,2,3,4,5,6,7,k256[56],hf( 8));
+    one_cycle(7,0,1,2,3,4,5,6,k256[57],hf( 9));
+    one_cycle(6,7,0,1,2,3,4,5,k256[58],hf(10));
+    one_cycle(5,6,7,0,1,2,3,4,k256[59],hf(11));
+    one_cycle(4,5,6,7,0,1,2,3,k256[60],hf(12));
+    one_cycle(3,4,5,6,7,0,1,2,k256[61],hf(13));
+    one_cycle(2,3,4,5,6,7,0,1,k256[62],hf(14));
+    one_cycle(1,2,3,4,5,6,7,0,k256[63],hf(15));
+
+    ctx->hash[0] += v0; ctx->hash[1] += v1;
+    ctx->hash[2] += v2; ctx->hash[3] += v3;
+    ctx->hash[4] += v4; ctx->hash[5] += v5;
+    ctx->hash[6] += v6; ctx->hash[7] += v7;
+#endif
+}
+
+/* Absorb len bytes from data[] into the SHA256 context: bytes are buffered */
+/* in ctx->wbuf[] and sha256_compile() is run each time a block is full.    */
+/* ctx->count[] holds the running byte count (count[1] takes the carry).    */
+VOID_RETURN sha256_hash(const unsigned char data[], unsigned long len, sha256_ctx ctx[1])
+{   uint_32t pos = (uint_32t)(ctx->count[0] & SHA256_MASK),
+             space = SHA256_BLOCK_SIZE - pos;
+    const unsigned char *sp = data;
+    /* add len to the byte count, carrying into count[1] on wrap-around     */
+    if((ctx->count[0] += len) < len)
+        ++(ctx->count[1]);
+    /* NOTE(review): carry test assumes len is no wider than count[0] - confirm */
+    while(len >= space)     /* transfer whole blocks while possible */
+    {
+        memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space);
+        sp += space; len -= space; space = SHA256_BLOCK_SIZE; pos = 0;
+        bsw_32(ctx->wbuf, SHA256_BLOCK_SIZE >> 2)   /* macro block: no ';'  */
+        sha256_compile(ctx);
+    }
+    /* keep the remaining partial block (fewer than space bytes) for later  */
+    memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len);
+}
+
+/* SHA256 Final padding and digest calculation: pads the buffered data,     */
+/* appends the bit length, compresses, and writes hlen digest bytes to hval */
+static void sha_end1(unsigned char hval[], sha256_ctx ctx[1], const unsigned int hlen)
+{   uint_32t    i = (uint_32t)(ctx->count[0] & SHA256_MASK);
+    /* i is the number of bytes currently held in the buffer                */
+    /* put bytes in the buffer in an order in which references to   */
+    /* 32-bit words will put bytes with lower addresses into the    */
+    /* top of 32 bit words on BOTH big and little endian machines   */
+    bsw_32(ctx->wbuf, (i + 3) >> 2)
+
+    /* we now need to mask valid bytes and add the padding which is */
+    /* a single 1 bit and as many zero bits as necessary. Note that */
+    /* we can always add the first padding byte here because the    */
+    /* buffer always has at least one empty slot                    */
+    ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3);
+    ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3);
+
+    /* we need 9 or more empty positions, one for the padding byte  */
+    /* (above) and eight for the length count.  If there is not     */
+    /* enough space pad and empty the buffer                        */
+    if(i > SHA256_BLOCK_SIZE - 9)
+    {
+        if(i < 60) ctx->wbuf[15] = 0;   /* word 15 not reached by the padding word */
+        sha256_compile(ctx);
+        i = 0;
+    }
+    else    /* compute a word index for the empty buffer positions  */
+        i = (i >> 2) + 1;
+
+    while(i < 14) /* and zero pad all but last two positions        */
+        ctx->wbuf[i++] = 0;
+
+    /* the following 32-bit length fields are assembled in the      */
+    /* wrong byte order on little endian machines but this is       */
+    /* corrected later since they are only ever used as 32-bit      */
+    /* word values.                                                 */
+    ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29);
+    ctx->wbuf[15] = ctx->count[0] << 3;
+    sha256_compile(ctx);
+
+    /* extract the hash value as bytes in case the hash buffer is   */
+    /* misaligned for 32-bit words                                  */
+    for(i = 0; i < hlen; ++i)
+        hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3)));
+}
+
+#endif
+
+#if defined(SHA_224)
+/* SHA224 initial hash values, copied into ctx->hash[] by sha224_begin()    */
+const uint_32t i224[8] =
+{
+    0xc1059ed8ul, 0x367cd507ul, 0x3070dd17ul, 0xf70e5939ul,
+    0xffc00b31ul, 0x68581511ul, 0x64f98fa7ul, 0xbefa4fa4ul
+};
+/* Start a SHA224 computation: zero the byte count and load the i224 state  */
+VOID_RETURN sha224_begin(sha224_ctx ctx[1])
+{
+    ctx->count[0] = ctx->count[1] = 0;
+    memcpy(ctx->hash, i224, 8 * sizeof(uint_32t));
+}
+/* Finish SHA224: pad, compress and write SHA224_DIGEST_SIZE bytes to hval  */
+VOID_RETURN sha224_end(unsigned char hval[], sha224_ctx ctx[1])
+{
+    sha_end1(hval, ctx, SHA224_DIGEST_SIZE);
+}
+/* One-shot SHA224 of data[0..len-1] into hval[] using a local context      */
+VOID_RETURN sha224(unsigned char hval[], const unsigned char data[], unsigned long len)
+{   sha224_ctx  cx[1];
+
+    sha224_begin(cx);
+    sha224_hash(data, len, cx);
+    sha_end1(hval, cx, SHA224_DIGEST_SIZE);
+}
+
+#endif
+
+#if defined(SHA_256)
+/* SHA256 initial hash values, copied into ctx->hash[] by sha256_begin()    */
+const uint_32t i256[8] =
+{
+    0x6a09e667ul, 0xbb67ae85ul, 0x3c6ef372ul, 0xa54ff53aul,
+    0x510e527ful, 0x9b05688cul, 0x1f83d9abul, 0x5be0cd19ul
+};
+/* Start a SHA256 computation: zero the byte count and load the i256 state  */
+VOID_RETURN sha256_begin(sha256_ctx ctx[1])
+{
+    ctx->count[0] = ctx->count[1] = 0;
+    memcpy(ctx->hash, i256, 8 * sizeof(uint_32t));
+}
+/* Finish SHA256: pad, compress and write SHA256_DIGEST_SIZE bytes to hval  */
+VOID_RETURN sha256_end(unsigned char hval[], sha256_ctx ctx[1])
+{
+    sha_end1(hval, ctx, SHA256_DIGEST_SIZE);
+}
+/* One-shot SHA256 of data[0..len-1] into hval[] using a local context      */
+VOID_RETURN sha256(unsigned char hval[], const unsigned char data[], unsigned long len)
+{   sha256_ctx  cx[1];
+
+    sha256_begin(cx);
+    sha256_hash(data, len, cx);
+    sha_end1(hval, cx, SHA256_DIGEST_SIZE);
+}
+
+#endif
+
+#if defined(SHA_384) || defined(SHA_512)
+
+#define SHA512_MASK (SHA512_BLOCK_SIZE - 1)     /* mask: byte offset in block */
+/* bsw_64(p,n): byte-swap n 64-bit words at p; empty unless SWAP_BYTES is set */
+#if defined(SWAP_BYTES)
+#define bsw_64(p,n) \
+    { int _i = (n); while(_i--) ((uint_64t*)p)[_i] = bswap_64(((uint_64t*)p)[_i]); }
+#else
+#define bsw_64(p,n)
+#endif
+
+/* SHA512 mixing function definitions (replacing any SHA256 ones in force) */
+
+#ifdef   s_0
+# undef  s_0
+# undef  s_1
+# undef  g_0
+# undef  g_1
+# undef  k_0
+#endif
+
+#define s_0(x)  (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39))
+#define s_1(x)  (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41))
+#define g_0(x)  (rotr64((x),  1) ^ rotr64((x),  8) ^ ((x) >>  7))
+#define g_1(x)  (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >>  6))
+#define k_0     k512
+
+/* SHA384/SHA512 mixing data    */
+
+const uint_64t  k512[80] =
+{   /* eighty 64-bit constants, reached by the SHA384/512 code through the name k_0 */
+    li_64(428a2f98d728ae22), li_64(7137449123ef65cd),
+    li_64(b5c0fbcfec4d3b2f), li_64(e9b5dba58189dbbc),
+    li_64(3956c25bf348b538), li_64(59f111f1b605d019),
+    li_64(923f82a4af194f9b), li_64(ab1c5ed5da6d8118),
+    li_64(d807aa98a3030242), li_64(12835b0145706fbe),
+    li_64(243185be4ee4b28c), li_64(550c7dc3d5ffb4e2),
+    li_64(72be5d74f27b896f), li_64(80deb1fe3b1696b1),
+    li_64(9bdc06a725c71235), li_64(c19bf174cf692694),
+    li_64(e49b69c19ef14ad2), li_64(efbe4786384f25e3),
+    li_64(0fc19dc68b8cd5b5), li_64(240ca1cc77ac9c65),
+    li_64(2de92c6f592b0275), li_64(4a7484aa6ea6e483),
+    li_64(5cb0a9dcbd41fbd4), li_64(76f988da831153b5),
+    li_64(983e5152ee66dfab), li_64(a831c66d2db43210),
+    li_64(b00327c898fb213f), li_64(bf597fc7beef0ee4),
+    li_64(c6e00bf33da88fc2), li_64(d5a79147930aa725),
+    li_64(06ca6351e003826f), li_64(142929670a0e6e70),
+    li_64(27b70a8546d22ffc), li_64(2e1b21385c26c926),
+    li_64(4d2c6dfc5ac42aed), li_64(53380d139d95b3df),
+    li_64(650a73548baf63de), li_64(766a0abb3c77b2a8),
+    li_64(81c2c92e47edaee6), li_64(92722c851482353b),
+    li_64(a2bfe8a14cf10364), li_64(a81a664bbc423001),
+    li_64(c24b8b70d0f89791), li_64(c76c51a30654be30),
+    li_64(d192e819d6ef5218), li_64(d69906245565a910),
+    li_64(f40e35855771202a), li_64(106aa07032bbd1b8),
+    li_64(19a4c116b8d2d0c8), li_64(1e376c085141ab53),
+    li_64(2748774cdf8eeb99), li_64(34b0bcb5e19b48a8),
+    li_64(391c0cb3c5c95a63), li_64(4ed8aa4ae3418acb),
+    li_64(5b9cca4f7763e373), li_64(682e6ff3d6b2b8a3),
+    li_64(748f82ee5defb2fc), li_64(78a5636f43172f60),
+    li_64(84c87814a1f0ab72), li_64(8cc702081a6439ec),
+    li_64(90befffa23631e28), li_64(a4506cebde82bde9),
+    li_64(bef9a3f7b2c67915), li_64(c67178f2e372532b),
+    li_64(ca273eceea26619c), li_64(d186b8c721c0c207),
+    li_64(eada7dd6cde0eb1e), li_64(f57d4f7fee6ed178),
+    li_64(06f067aa72176fba), li_64(0a637dc5a2c898a6),
+    li_64(113f9804bef90dae), li_64(1b710b35131c471b),
+    li_64(28db77f523047d84), li_64(32caab7b40c72493),
+    li_64(3c9ebe0a15c9bebc), li_64(431d67c49c100d4c),
+    li_64(4cc5d4becb3e42b6), li_64(597f299cfc657e2a),
+    li_64(5fcb6fab3ad6faec), li_64(6c44198c4a475817)
+};
+
+/* Compile 128 bytes of hash data into SHA384/512 digest    */
+/* NOTE: this routine assumes that the byte order in the    */
+/* ctx->wbuf[] at this point is such that low address bytes */
+/* in the ORIGINAL byte stream will go into the high end of */
+/* words on BOTH big and little endian systems              */
+
+VOID_RETURN sha512_compile(sha512_ctx ctx[1])
+{   uint_64t    v[8], *p = ctx->wbuf;
+    uint_32t    j;
+    /* NOTE(review): p and v are not named below - presumably used inside v_cycle; confirm */
+    memcpy(v, ctx->hash, 8 * sizeof(uint_64t));    /* work on a copy of the hash state */
+
+    for(j = 0; j < 80; j += 16)     /* 5 passes of 16 cycles = 80 cycles in total */
+    {
+        v_cycle( 0, j); v_cycle( 1, j);
+        v_cycle( 2, j); v_cycle( 3, j);
+        v_cycle( 4, j); v_cycle( 5, j);
+        v_cycle( 6, j); v_cycle( 7, j);
+        v_cycle( 8, j); v_cycle( 9, j);
+        v_cycle(10, j); v_cycle(11, j);
+        v_cycle(12, j); v_cycle(13, j);
+        v_cycle(14, j); v_cycle(15, j);
+    }
+    /* add the working copy back into the hash state held in the context */
+    ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
+    ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
+    ctx->hash[4] += v[4]; ctx->hash[5] += v[5];
+    ctx->hash[6] += v[6]; ctx->hash[7] += v[7];
+}
+
+/* SHA384/SHA512 hash data in an array of bytes into the    */
+/* hash buffer, compiling each 128 byte block once filled.  */
+/* NOTE: bsw_64 is applied to each full block first so that */
+/* low address bytes in the ORIGINAL byte stream go to the  */
+/* high end of 64-bit words on BOTH big and little endian   */
+/* systems                                                  */
+
+VOID_RETURN sha512_hash(const unsigned char data[], unsigned long len, sha512_ctx ctx[1])
+{   unsigned char *buf = (unsigned char*)ctx->wbuf;             /* byte view of the block buffer */
+    uint_32t used = (uint_32t)(ctx->count[0] & SHA512_MASK);    /* bytes already buffered        */
+    uint_32t room = SHA512_BLOCK_SIZE - used;                   /* bytes still free in the block */
+
+    ctx->count[0] += len;                           /* running byte count, low word   */
+    if(ctx->count[0] < len) ++(ctx->count[1]);      /* wrapped: carry into high word  */
+
+    for( ; len >= room; used = 0, room = SHA512_BLOCK_SIZE)     /* transfer whole blocks */
+    {
+        memcpy(buf + used, data, room);
+        data += room; len -= room;
+        bsw_64(ctx->wbuf, SHA512_BLOCK_SIZE >> 3);
+        sha512_compile(ctx);
+    }
+
+    memcpy(buf + used, data, len);      /* keep the remaining partial block for later */
+}
+
+/* SHA384/512 Final padding and digest calculation  */
+/* hlen is the number of digest bytes written to hval; the context is consumed */
+static void sha_end2(unsigned char hval[], sha512_ctx ctx[1], const unsigned int hlen)
+{   uint_32t    i = (uint_32t)(ctx->count[0] & SHA512_MASK);   /* bytes in the partial block */
+
+    /* put bytes in the buffer in an order in which references to   */
+    /* 64-bit words will put bytes with lower addresses into the    */
+    /* top of 64 bit words on BOTH big and little endian machines   */
+    bsw_64(ctx->wbuf, (i + 7) >> 3);
+
+    /* we now need to mask valid bytes and add the padding which is */
+    /* a single 1 bit and as many zero bits as necessary. Note that */
+    /* we can always add the first padding byte here because the    */
+    /* buffer always has at least one empty slot                    */
+    ctx->wbuf[i >> 3] &= li_64(ffffffffffffff00) << 8 * (~i & 7);
+    ctx->wbuf[i >> 3] |= li_64(0000000000000080) << 8 * (~i & 7);
+
+    /* we need 17 or more empty byte positions, one for the padding */
+    /* byte (above) and sixteen for the length count.  If there is  */
+    /* not enough space pad and empty the buffer                    */
+    if(i > SHA512_BLOCK_SIZE - 17)
+    {
+        if(i < 120) ctx->wbuf[15] = 0;     /* padding byte fell in word 14 */
+        sha512_compile(ctx);
+        i = 0;
+    }
+    else
+        i = (i >> 3) + 1;                  /* first word after the padding */
+
+    while(i < 14)                          /* zero up to the length words  */
+        ctx->wbuf[i++] = 0;
+
+    /* the following 64-bit length fields are assembled in the      */
+    /* wrong byte order on little endian machines but this is       */
+    /* corrected later since they are only ever used as 64-bit      */
+    /* word values.                                                 */
+    ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 61);
+    ctx->wbuf[15] = ctx->count[0] << 3;
+    sha512_compile(ctx);
+
+    /* extract the hash value as bytes in case the hash buffer is   */
+    /* misaligned for 64-bit words                                  */
+    for(i = 0; i < hlen; ++i)
+        hval[i] = (unsigned char)(ctx->hash[i >> 3] >> (8 * (~i & 7)));
+}
+
+#endif
+
+#if defined(SHA_384)
+
+/* SHA384 initialisation data   */
+
+const uint_64t  i384[8] =
+{   /* eight words: exactly what sha384_begin and sha2_begin copy into ctx->hash */
+    li_64(cbbb9d5dc1059ed8), li_64(629a292a367cd507),
+    li_64(9159015a3070dd17), li_64(152fecd8f70e5939),
+    li_64(67332667ffc00b31), li_64(8eb44a8768581511),
+    li_64(db0c2e0d64f98fa7), li_64(47b5481dbefa4fa4)
+};
+
+VOID_RETURN sha384_begin(sha384_ctx ctx[1])
+{   /* start a new message: nothing counted yet, hash state taken from i384 */
+    ctx->count[1] = 0; ctx->count[0] = 0;
+    memcpy(ctx->hash, i384, sizeof(ctx->hash));
+}
+
+VOID_RETURN sha384_end(unsigned char hval[], sha384_ctx ctx[1])
+{   /* pad and finish through sha_end2, writing SHA384_DIGEST_SIZE bytes to hval */
+    sha_end2(hval, ctx, SHA384_DIGEST_SIZE);
+}
+
+VOID_RETURN sha384(unsigned char hval[], const unsigned char data[], unsigned long len)
+{   sha384_ctx  ctx[1];     /* one-shot context, lives only for this call */
+
+    sha384_begin(ctx);              /* load the SHA384 starting state   */
+    sha384_hash(data, len, ctx);    /* absorb all len bytes of data     */
+    sha384_end(hval, ctx);          /* pad the message, emit the digest */
+}
+
+#endif
+
+#if defined(SHA_512)
+
+/* SHA512 initialisation data   */
+
+const uint_64t  i512[8] =
+{   /* eight words: exactly what sha512_begin and sha2_begin copy into ctx->hash */
+    li_64(6a09e667f3bcc908), li_64(bb67ae8584caa73b),
+    li_64(3c6ef372fe94f82b), li_64(a54ff53a5f1d36f1),
+    li_64(510e527fade682d1), li_64(9b05688c2b3e6c1f),
+    li_64(1f83d9abfb41bd6b), li_64(5be0cd19137e2179)
+};
+
+VOID_RETURN sha512_begin(sha512_ctx ctx[1])
+{   /* start a new message: nothing counted yet, hash state taken from i512 */
+    ctx->count[1] = 0; ctx->count[0] = 0;
+    memcpy(ctx->hash, i512, sizeof(ctx->hash));
+}
+
+VOID_RETURN sha512_end(unsigned char hval[], sha512_ctx ctx[1])
+{   /* pad and finish through sha_end2, writing SHA512_DIGEST_SIZE bytes to hval */
+    sha_end2(hval, ctx, SHA512_DIGEST_SIZE);
+}
+
+VOID_RETURN sha512(unsigned char hval[], const unsigned char data[], unsigned long len)
+{   sha512_ctx  ctx[1];     /* one-shot context, lives only for this call */
+
+    sha512_begin(ctx);              /* load the SHA512 starting state   */
+    sha512_hash(data, len, ctx);    /* absorb all len bytes of data     */
+    sha512_end(hval, ctx);          /* pad the message, emit the digest */
+}
+
+#endif
+
+#if defined(SHA_2)
+
+#define CTX_224(x)  ((x)->uu->ctx256)   /* SHA224 runs on the 256-bit context */
+#define CTX_256(x)  ((x)->uu->ctx256)
+#define CTX_384(x)  ((x)->uu->ctx512)   /* SHA384 runs on the 512-bit context */
+#define CTX_512(x)  ((x)->uu->ctx512)
+
+/* SHA2 initialisation */
+
+INT_RETURN sha2_begin(unsigned long len, sha2_ctx ctx[1])
+{   /* len names the digest either in bits (224..512) or in bytes (28..64) */
+    switch(len)
+    {
+#if defined(SHA_224)
+        case  28:
+        case 224:   memcpy(CTX_224(ctx)->hash, i224, 8 * sizeof(uint_32t));
+                    CTX_224(ctx)->count[1] = 0; CTX_224(ctx)->count[0] = 0;
+                    ctx->sha2_len = SHA224_DIGEST_SIZE; return EXIT_SUCCESS;
+#endif
+#if defined(SHA_256)
+        case  32:
+        case 256:   memcpy(CTX_256(ctx)->hash, i256, 8 * sizeof(uint_32t));
+                    CTX_256(ctx)->count[1] = 0; CTX_256(ctx)->count[0] = 0;
+                    ctx->sha2_len = SHA256_DIGEST_SIZE; return EXIT_SUCCESS;
+#endif
+#if defined(SHA_384)
+        case  48:
+        case 384:   memcpy(CTX_384(ctx)->hash, i384, 8 * sizeof(uint_64t));
+                    CTX_384(ctx)->count[1] = 0; CTX_384(ctx)->count[0] = 0;
+                    ctx->sha2_len = SHA384_DIGEST_SIZE; return EXIT_SUCCESS;
+#endif
+#if defined(SHA_512)
+        case  64:
+        case 512:   memcpy(CTX_512(ctx)->hash, i512, 8 * sizeof(uint_64t));
+                    CTX_512(ctx)->count[1] = 0; CTX_512(ctx)->count[0] = 0;
+                    ctx->sha2_len = SHA512_DIGEST_SIZE; return EXIT_SUCCESS;
+#endif
+        default:    return EXIT_FAILURE;    /* digest length not supported */
+    }
+}
+
+VOID_RETURN sha2_hash(const unsigned char data[], unsigned long len, sha2_ctx ctx[1])
+{   const uint_32t dlen = ctx->sha2_len;    /* digest size recorded by sha2_begin */
+
+#if defined(SHA_224)
+    if(dlen == SHA224_DIGEST_SIZE) { sha224_hash(data, len, CTX_224(ctx)); return; }
+#endif
+#if defined(SHA_256)
+    if(dlen == SHA256_DIGEST_SIZE) { sha256_hash(data, len, CTX_256(ctx)); return; }
+#endif
+#if defined(SHA_384)
+    if(dlen == SHA384_DIGEST_SIZE) { sha384_hash(data, len, CTX_384(ctx)); return; }
+#endif
+#if defined(SHA_512)
+    if(dlen == SHA512_DIGEST_SIZE) { sha512_hash(data, len, CTX_512(ctx)); return; }
+#endif
+    /* unrecognised length: the data is ignored */
+    (void)dlen;
+}
+
+VOID_RETURN sha2_end(unsigned char hval[], sha2_ctx ctx[1])
+{   /* finish whichever hash sha2_begin selected; other lengths write nothing */
+    switch(ctx->sha2_len)
+    {
+#if defined(SHA_224)
+        case SHA224_DIGEST_SIZE: sha224_end(hval, CTX_224(ctx)); break;
+#endif
+#if defined(SHA_256)
+        case SHA256_DIGEST_SIZE: sha256_end(hval, CTX_256(ctx)); break;
+#endif
+#if defined(SHA_384)
+        case SHA384_DIGEST_SIZE: sha384_end(hval, CTX_384(ctx)); break;
+#endif
+#if defined(SHA_512)
+        case SHA512_DIGEST_SIZE: sha512_end(hval, CTX_512(ctx)); break;
+#endif
+    }
+}
+
+INT_RETURN sha2(unsigned char hval[], unsigned long size,
+                                const unsigned char data[], unsigned long len)
+{   sha2_ctx    ctx[1];     /* one-shot context, lives only for this call */
+
+    if(sha2_begin(size, ctx) != EXIT_SUCCESS)   /* size not supported */
+        return EXIT_FAILURE;
+
+    sha2_hash(data, len, ctx);
+    sha2_end(hval, ctx);
+    return EXIT_SUCCESS;
+}
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/src/Crypto/Sha2.h b/src/Crypto/Sha2.h
index 64379d17..6d0aeb0f 100644
--- a/src/Crypto/Sha2.h
+++ b/src/Crypto/Sha2.h
@@ -1,155 +1,155 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
- Issue Date: 01/08/2005
-*/
-
-#ifndef _SHA2_H
-#define _SHA2_H
-
-#include "Common/Tcdefs.h"
-#include "Common/Endian.h"
-
-#define SHA_64BIT
-
-/* define the hash functions that you need  */
-#define SHA_2   /* for dynamic hash length  */
-#define SHA_224
-#define SHA_256
-#ifdef SHA_64BIT
-#  define SHA_384
-#  define SHA_512
-#  define NEED_UINT_64T
-#endif
-
-#ifndef EXIT_SUCCESS
-#define EXIT_SUCCESS    0
-#define EXIT_FAILURE    1
-#endif
-
-#define li_64(h) 0x##h##ull
-
-#define VOID_RETURN	void
-#define INT_RETURN	int
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-/* Note that the following function prototypes are the same */
-/* for both the bit and byte oriented implementations.  But */
-/* the length fields are in bytes or bits as is appropriate */
-/* for the version used.  Bit sequences are arrays of bytes */
-/* in which bit sequence indexes increase from the most to  */
-/* the least significant end of each byte                   */
-
-#define SHA224_DIGEST_SIZE  28
-#define SHA224_BLOCK_SIZE   64
-#define SHA256_DIGEST_SIZE  32
-#define SHA256_BLOCK_SIZE   64
-
-/* type to hold the SHA256 (and SHA224) context */
-
-typedef struct
-{   uint_32t count[2];
-    uint_32t hash[8];
-    uint_32t wbuf[16];
-} sha256_ctx;
-
-typedef sha256_ctx  sha224_ctx;
-
-VOID_RETURN sha256_compile(sha256_ctx ctx[1]);
-
-VOID_RETURN sha224_begin(sha224_ctx ctx[1]);
-#define sha224_hash sha256_hash
-VOID_RETURN sha224_end(unsigned char hval[], sha224_ctx ctx[1]);
-VOID_RETURN sha224(unsigned char hval[], const unsigned char data[], unsigned long len);
-
-VOID_RETURN sha256_begin(sha256_ctx ctx[1]);
-VOID_RETURN sha256_hash(const unsigned char data[], unsigned long len, sha256_ctx ctx[1]);
-VOID_RETURN sha256_end(unsigned char hval[], sha256_ctx ctx[1]);
-VOID_RETURN sha256(unsigned char hval[], const unsigned char data[], unsigned long len);
-
-#ifndef SHA_64BIT
-
-typedef struct
-{   union
-    { sha256_ctx  ctx256[1];
-    } uu[1];
-    uint_32t    sha2_len;
-} sha2_ctx;
-
-#define SHA2_MAX_DIGEST_SIZE    SHA256_DIGEST_SIZE
-
-#else
-
-#define SHA384_DIGEST_SIZE  48
-#define SHA384_BLOCK_SIZE  128
-#define SHA512_DIGEST_SIZE  64
-#define SHA512_BLOCK_SIZE  128
-#define SHA2_MAX_DIGEST_SIZE    SHA512_DIGEST_SIZE
-
-/* type to hold the SHA384 (and SHA512) context */
-
-typedef struct
-{   uint_64t count[2];
-    uint_64t hash[8];
-    uint_64t wbuf[16];
-} sha512_ctx;
-
-typedef sha512_ctx  sha384_ctx;
-
-typedef struct
-{   union
-    { sha256_ctx  ctx256[1];
-      sha512_ctx  ctx512[1];
-    } uu[1];
-    uint_32t    sha2_len;
-} sha2_ctx;
-
-VOID_RETURN sha512_compile(sha512_ctx ctx[1]);
-
-VOID_RETURN sha384_begin(sha384_ctx ctx[1]);
-#define sha384_hash sha512_hash
-VOID_RETURN sha384_end(unsigned char hval[], sha384_ctx ctx[1]);
-VOID_RETURN sha384(unsigned char hval[], const unsigned char data[], unsigned long len);
-
-VOID_RETURN sha512_begin(sha512_ctx ctx[1]);
-VOID_RETURN sha512_hash(const unsigned char data[], unsigned long len, sha512_ctx ctx[1]);
-VOID_RETURN sha512_end(unsigned char hval[], sha512_ctx ctx[1]);
-VOID_RETURN sha512(unsigned char hval[], const unsigned char data[], unsigned long len);
-
-INT_RETURN  sha2_begin(unsigned long size, sha2_ctx ctx[1]);
-VOID_RETURN sha2_hash(const unsigned char data[], unsigned long len, sha2_ctx ctx[1]);
-VOID_RETURN sha2_end(unsigned char hval[], sha2_ctx ctx[1]);
-INT_RETURN  sha2(unsigned char hval[], unsigned long size, const unsigned char data[], unsigned long len);
-
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+*/
+
+#ifndef _SHA2_H
+#define _SHA2_H
+
+#include "Common/Tcdefs.h"
+#include "Common/Endian.h"
+
+#define SHA_64BIT
+
+/* define the hash functions that you need  */
+#define SHA_2   /* for dynamic hash length  */
+#define SHA_224
+#define SHA_256
+#ifdef SHA_64BIT
+#  define SHA_384
+#  define SHA_512
+#  define NEED_UINT_64T
+#endif
+
+#ifndef EXIT_SUCCESS
+#define EXIT_SUCCESS    0
+#define EXIT_FAILURE    1
+#endif
+
+#define li_64(h) 0x##h##ull
+
+#define VOID_RETURN	void
+#define INT_RETURN	int
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/* Note that the following function prototypes are the same */
+/* for both the bit and byte oriented implementations.  But */
+/* the length fields are in bytes or bits as is appropriate */
+/* for the version used.  Bit sequences are arrays of bytes */
+/* in which bit sequence indexes increase from the most to  */
+/* the least significant end of each byte                   */
+
+#define SHA224_DIGEST_SIZE  28
+#define SHA224_BLOCK_SIZE   64
+#define SHA256_DIGEST_SIZE  32
+#define SHA256_BLOCK_SIZE   64
+
+/* type to hold the SHA256 (and SHA224) context */
+
+typedef struct
+{   uint_32t count[2];     /* zeroed by the begin routines; presumably the length counter - confirm */
+    uint_32t hash[8];      /* hash state, loaded from i224 or i256 when hashing starts */
+    uint_32t wbuf[16];     /* 16 words = one SHA256_BLOCK_SIZE block; presumably the input buffer */
+} sha256_ctx;
+
+typedef sha256_ctx  sha224_ctx;
+
+VOID_RETURN sha256_compile(sha256_ctx ctx[1]);
+
+VOID_RETURN sha224_begin(sha224_ctx ctx[1]);
+#define sha224_hash sha256_hash
+VOID_RETURN sha224_end(unsigned char hval[], sha224_ctx ctx[1]);
+VOID_RETURN sha224(unsigned char hval[], const unsigned char data[], unsigned long len);
+
+VOID_RETURN sha256_begin(sha256_ctx ctx[1]);
+VOID_RETURN sha256_hash(const unsigned char data[], unsigned long len, sha256_ctx ctx[1]);
+VOID_RETURN sha256_end(unsigned char hval[], sha256_ctx ctx[1]);
+VOID_RETURN sha256(unsigned char hval[], const unsigned char data[], unsigned long len);
+
+#ifndef SHA_64BIT
+
+typedef struct
+{   union
+    { sha256_ctx  ctx256[1];
+    } uu[1];
+    uint_32t    sha2_len;
+} sha2_ctx;
+
+#define SHA2_MAX_DIGEST_SIZE    SHA256_DIGEST_SIZE
+
+#else
+
+#define SHA384_DIGEST_SIZE  48
+#define SHA384_BLOCK_SIZE  128
+#define SHA512_DIGEST_SIZE  64
+#define SHA512_BLOCK_SIZE  128
+#define SHA2_MAX_DIGEST_SIZE    SHA512_DIGEST_SIZE
+
+/* type to hold the SHA384 (and SHA512) context */
+
+typedef struct
+{   uint_64t count[2];     /* bytes hashed so far: [0] low word, [1] high word (carry) */
+    uint_64t hash[8];      /* running hash state, updated by sha512_compile */
+    uint_64t wbuf[16];     /* one 128 byte input block awaiting compilation */
+} sha512_ctx;
+
+typedef sha512_ctx  sha384_ctx;
+
+typedef struct
+{   union
+    { sha256_ctx  ctx256[1];   /* in use when sha2_len is 28 or 32 */
+      sha512_ctx  ctx512[1];   /* in use when sha2_len is 48 or 64 */
+    } uu[1];
+    uint_32t    sha2_len;      /* digest size in bytes, set by sha2_begin */
+} sha2_ctx;
+
+VOID_RETURN sha512_compile(sha512_ctx ctx[1]);
+
+VOID_RETURN sha384_begin(sha384_ctx ctx[1]);
+#define sha384_hash sha512_hash
+VOID_RETURN sha384_end(unsigned char hval[], sha384_ctx ctx[1]);
+VOID_RETURN sha384(unsigned char hval[], const unsigned char data[], unsigned long len);
+
+VOID_RETURN sha512_begin(sha512_ctx ctx[1]);
+VOID_RETURN sha512_hash(const unsigned char data[], unsigned long len, sha512_ctx ctx[1]);
+VOID_RETURN sha512_end(unsigned char hval[], sha512_ctx ctx[1]);
+VOID_RETURN sha512(unsigned char hval[], const unsigned char data[], unsigned long len);
+
+#endif  /* SHA_64BIT - the sha2_* interface below is declared in either configuration */
+
+INT_RETURN  sha2_begin(unsigned long size, sha2_ctx ctx[1]);
+VOID_RETURN sha2_hash(const unsigned char data[], unsigned long len, sha2_ctx ctx[1]);
+VOID_RETURN sha2_end(unsigned char hval[], sha2_ctx ctx[1]);
+INT_RETURN  sha2(unsigned char hval[], unsigned long size, const unsigned char data[], unsigned long len);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/src/Crypto/Sha2Small.c b/src/Crypto/Sha2Small.c
index 9acd1b83..539ff05d 100644
--- a/src/Crypto/Sha2Small.c
+++ b/src/Crypto/Sha2Small.c
@@ -10,237 +10,237 @@
  *
  */
 
-/* Adapted for VeraCrypt */
-
-#include <memory.h>
-#include "Common/Tcdefs.h"
-#include "Common/Endian.h"
-#include "Sha2Small.h"
-
-#pragma optimize ("tl", on)
-
-typedef unsigned __int32 uint32;
-typedef unsigned __int8 byte;
-
-#include <stdlib.h>
-#pragma intrinsic(_lrotr)
-#define RORc(x,n) _lrotr(x,n)
-
-/******************************************************************************/
-
-/*
-	The K array
- */
-
-static const uint32 K[64] = {
-	0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL,
-	0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL,
-	0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL,
-	0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
-	0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL,
-	0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL,
-	0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL,
-	0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
-	0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL,
-	0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL,
-	0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL,
-	0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
-	0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
-};
-
-/*
-	Various logical functions
- */
-#define Ch(x,y,z)			(z ^ (x & (y ^ z)))
-#define Maj(x,y,z)		(((x | y) & z) | (x & y)) 
-#define S(x, n)			RORc((x),(n))
-#define R(x, n)			((x)>>(n))
-#define Sigma0(x)			(S(x, 2) ^ S(x, 13) ^ S(x, 22))
-#define Sigma1(x)			(S(x, 6) ^ S(x, 11) ^ S(x, 25))
-#define Gamma0(x)			(S(x, 7) ^ S(x, 18) ^ R(x, 3))
-#define Gamma1(x)			(S(x, 17) ^ S(x, 19) ^ R(x, 10))
-
-#define STORE32H(x, y, i) { \
-(y)[i] = (unsigned char)(((x)>>24)); \
-(y)[i+1] = (unsigned char)(((x)>>16)); \
-(y)[i+2] = (unsigned char)(((x)>>8)); \
-(y)[i+3] = (unsigned char)((x)); \
-}
-
-#define LOAD32H(x, y, i) { \
-x = ((unsigned long)((y)[i])<<24) | \
-((unsigned long)((y)[i+1])<<16) | \
-((unsigned long)((y)[i+2])<<8)  | \
-((unsigned long)((y)[i+3])); \
-}
-
-/*
-	compress 512-bits
- */
-static void sha256_compress(sha256_ctx * ctx, unsigned char *buf)
-{
-
-	uint32 S[8], W[64], t0, t1;
-	uint32 t, w2, w15;
-	int i;
-
-/*
-	copy state into S
- */
-	for (i = 0; i < 8; i++) {
-		S[i] = ctx->state[i];
-	}
-
-/*
-	copy the state into 512-bits into W[0..15]
- */
-	for (i = 0; i < 16; i++) {
-		LOAD32H(W[i], buf , (4*i));
-	}
-
-/*
-	fill W[16..63]
- */
-	for (i = 16; i < 64; i++) {
-		w2 = W[i - 2];
-		w15 = W[i - 15];
-		W[i] = Gamma1(w2) + W[i - 7] + Gamma0(w15) + W[i - 16];
-	}
-
-/*
-	Compress
- */
-
-#define RND(a,b,c,d,e,f,g,h,i)							\
-	t0 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i];	\
-	t1 = Sigma0(a) + Maj(a, b, c);						\
-	d += t0;											\
-	h  = t0 + t1;
-
-	for (i = 0; i < 64; ++i) {
-		RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i);
-		t = S[7]; S[7] = S[6]; S[6] = S[5]; S[5] = S[4]; 
-		S[4] = S[3]; S[3] = S[2]; S[2] = S[1]; S[1] = S[0]; S[0] = t;
-	}
-
-/*
-	feedback
- */
-	for (i = 0; i < 8; i++) {
-		ctx->state[i] += S[i];
-	}
-
-}
-
-/*
-	init the sha256 state
- */
-VOID_RETURN sha256_begin(sha256_ctx* ctx)
-{
-	ctx->curlen = 0;
-	ctx->state[0] = 0x6A09E667UL;
-	ctx->state[1] = 0xBB67AE85UL;
-	ctx->state[2] = 0x3C6EF372UL;
-	ctx->state[3] = 0xA54FF53AUL;
-	ctx->state[4] = 0x510E527FUL;
-	ctx->state[5] = 0x9B05688CUL;
-	ctx->state[6] = 0x1F83D9ABUL;
-	ctx->state[7] = 0x5BE0CD19UL;
-	ctx->highLength = 0;
-	ctx->lowLength = 0;
-}
-
-VOID_RETURN sha256_hash(unsigned char* data, unsigned int len, sha256_ctx* ctx)
-{
-	uint32 n;
-	while (len > 0) {
-		if (ctx->curlen == 0 && len >= 64) {			
-			sha256_compress(ctx, (unsigned char *)data);
-
-			n = ctx->lowLength + 512;
-			if (n < ctx->lowLength) {
-				ctx->highLength++;
-			}
-			ctx->lowLength = n;
-			data		+= 64;
-			len		-= 64;
-		} else {
-			n = min(len, 64 - ctx->curlen);
-			memcpy(ctx->buf + ctx->curlen, data, (size_t)n);
-			ctx->curlen	+= (unsigned int) n;
-			data			+= (unsigned int) n;
-			len			-= (unsigned int) n;
-
-			if (ctx->curlen == 64) {
-				sha256_compress (ctx, ctx->buf);
-
-				n = ctx->lowLength + 512;
-				if (n < ctx->lowLength) {
-					ctx->highLength++;
-				}
-				ctx->lowLength = n;		
-				ctx->curlen	= 0;
-			}
-		}
-	}
-	return;
-}
-
-VOID_RETURN sha256_end(unsigned char* hval, sha256_ctx* ctx)
-{
-	int i;
-	uint32	n;
-
-/*
-	increase the length of the message
- */
-
-	n = ctx->lowLength + (ctx->curlen << 3);
-	if (n < ctx->lowLength) {
-		ctx->highLength++;
-	}
-	ctx->highLength += (ctx->curlen >> 29);
-	ctx->lowLength = n;
-
-/*
-	append the '1' bit
- */
-	ctx->buf[ctx->curlen++] = (unsigned char)0x80;
-
-/*
-	if the length is currently above 56 bytes we append zeros then compress.
-	Then we can fall back to padding zeros and length encoding like normal.
- */
-	if (ctx->curlen > 56) {
-		while (ctx->curlen < 64) {
-			ctx->buf[ctx->curlen++] = (unsigned char)0;
-		}
-		sha256_compress(ctx, ctx->buf);
-		ctx->curlen = 0;
-	}
-
-/*
-	pad upto 56 bytes of zeroes
- */
-	while (ctx->curlen < 56) {
-		ctx->buf[ctx->curlen++] = (unsigned char)0;
-	}
-
-/*
-	store length
- */
-
-	STORE32H(ctx->highLength, ctx->buf, 56);
-	STORE32H(ctx->lowLength, ctx->buf, 60);
-	
-	sha256_compress(ctx, ctx->buf);
-
-/*
-	copy output
- */
-	for (i = 0; i < 8; i++) {
-		STORE32H(ctx->state[i], hval, (4*i));
-	}
-}
-
-/******************************************************************************/
+/* Adapted for VeraCrypt */
+
+#include <memory.h>
+#include "Common/Tcdefs.h"
+#include "Common/Endian.h"
+#include "Sha2Small.h"
+
+#pragma optimize ("tl", on)
+
+typedef unsigned __int32 uint32;
+typedef unsigned __int8 byte;
+
+#include <stdlib.h>
+#pragma intrinsic(_lrotr)
+#define RORc(x,n) _lrotr(x,n)
+
+/******************************************************************************/
+
+/*
+	The K array
+ */
+
+static const uint32 K[64] = {	/* one constant per round, read as K[i] by RND */
+	0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL,
+	0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL,
+	0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL,
+	0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
+	0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL,
+	0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL,
+	0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL,
+	0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
+	0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL,
+	0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL,
+	0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL,
+	0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
+	0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
+};
+
+/*
+	Various logical functions
+ */
+#define Ch(x,y,z)			((z) ^ ((x) & ((y) ^ (z))))	/* per bit: y where x is 1, else z */
+#define Maj(x,y,z)		((((x) | (y)) & (z)) | ((x) & (y)))	/* per bit: majority of x, y, z */
+#define S(x, n)			RORc((x),(n))
+#define R(x, n)			((x)>>(n))
+#define Sigma0(x)			(S(x, 2) ^ S(x, 13) ^ S(x, 22))
+#define Sigma1(x)			(S(x, 6) ^ S(x, 11) ^ S(x, 25))
+#define Gamma0(x)			(S(x, 7) ^ S(x, 18) ^ R(x, 3))
+#define Gamma1(x)			(S(x, 17) ^ S(x, 19) ^ R(x, 10))
+
+#define STORE32H(x, y, i) { /* write x to y[i..i+3], most significant byte first */ \
+(y)[(i)] = (unsigned char)(((x)>>24)); \
+(y)[(i)+1] = (unsigned char)(((x)>>16)); \
+(y)[(i)+2] = (unsigned char)(((x)>>8)); \
+(y)[(i)+3] = (unsigned char)((x)); \
+}
+
+#define LOAD32H(x, y, i) { /* read y[i..i+3] into x, most significant byte first */ \
+(x) = ((unsigned long)((y)[(i)])<<24) | \
+((unsigned long)((y)[(i)+1])<<16) | \
+((unsigned long)((y)[(i)+2])<<8)  | \
+((unsigned long)((y)[(i)+3])); \
+}
+
+/*
+	compress one 512-bit (64 byte) block read from buf into ctx->state
+ */
+static void sha256_compress(sha256_ctx * ctx, unsigned char *buf)
+{
+
+	uint32 S[8], W[64], t0, t1;
+	uint32 t, w2, w15;
+	int i;
+
+/*
+	copy state into S, the eight working words
+ */
+	for (i = 0; i < 8; i++) {
+		S[i] = ctx->state[i];
+	}
+
+/*
+	load the 512-bit block into W[0..15] as sixteen big-endian 32-bit words
+ */
+	for (i = 0; i < 16; i++) {
+		LOAD32H(W[i], buf , (4*i));
+	}
+
+/*
+	fill W[16..63], each word derived from four earlier words
+ */
+	for (i = 16; i < 64; i++) {
+		w2 = W[i - 2];
+		w15 = W[i - 15];
+		W[i] = Gamma1(w2) + W[i - 7] + Gamma0(w15) + W[i - 16];
+	}
+
+/*
+	Compress: 64 rounds; RND updates d and h, then the working words shift one place
+ */
+
+#define RND(a,b,c,d,e,f,g,h,i)							\
+	t0 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i];	\
+	t1 = Sigma0(a) + Maj(a, b, c);						\
+	d += t0;											\
+	h  = t0 + t1;
+
+	for (i = 0; i < 64; ++i) {
+		RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i);
+		t = S[7]; S[7] = S[6]; S[6] = S[5]; S[5] = S[4]; 
+		S[4] = S[3]; S[3] = S[2]; S[2] = S[1]; S[1] = S[0]; S[0] = t;
+	}
+
+/*
+	feedback: add the working words into the state kept in ctx
+ */
+	for (i = 0; i < 8; i++) {
+		ctx->state[i] += S[i];
+	}
+
+}
+
+/*
+	init the sha256 state
+ */
+VOID_RETURN sha256_begin(sha256_ctx* ctx)
+{
+	static const uint32 iv[8] = {	/* starting values for ctx->state[0..7] */
+		0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+		0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+	};
+	int i;
+
+	for (i = 0; i < 8; i++)
+		ctx->state[i] = iv[i];
+	ctx->lowLength = 0;		/* no message bits counted yet */
+	ctx->highLength = 0;
+	ctx->curlen = 0;		/* nothing waiting in ctx->buf */
+}
+
+VOID_RETURN sha256_hash(unsigned char* data, unsigned int len, sha256_ctx* ctx)
+{
+	unsigned char *block;
+	uint32 n;
+
+	while (len != 0) {
+		if (ctx->curlen != 0 || len < 64) {
+			/* top up the partial block held in ctx->buf */
+			n = min(len, 64 - ctx->curlen);
+			memcpy(ctx->buf + ctx->curlen, data, (size_t)n);
+			ctx->curlen	+= (unsigned int) n;
+			data			+= (unsigned int) n;
+			len			-= (unsigned int) n;
+			if (ctx->curlen != 64) {
+				continue;	/* still short of a whole block */
+			}
+			block = ctx->buf;
+			ctx->curlen = 0;
+		} else {
+			/* nothing buffered: compress straight from the input */
+			block = data;
+			data += 64;
+			len -= 64;
+		}
+
+		sha256_compress(ctx, block);
+
+		/* one more block done: add 512 bits, carrying into the high word */
+		n = ctx->lowLength + 512;
+		if (n < ctx->lowLength) {
+			ctx->highLength++;
+		}
+		ctx->lowLength = n;
+	}
+}
+
+VOID_RETURN sha256_end(unsigned char* hval, sha256_ctx* ctx)
+{
+	uint32 bits;
+	int word;
+
+/*
+	fold the bytes still waiting in the buffer into the bit count
+ */
+
+	bits = ctx->lowLength + (ctx->curlen << 3);
+	if (bits < ctx->lowLength) {
+		ctx->highLength++;
+	}
+	ctx->highLength += (ctx->curlen >> 29);
+	ctx->lowLength = bits;
+
+/*
+	the padding always starts with a single '1' bit
+ */
+	ctx->buf[ctx->curlen] = (unsigned char)0x80;
+	ctx->curlen++;
+
+/*
+	with more than 56 bytes used the length cannot fit in this block:
+	zero-fill it, compress it and continue in an empty block
+ */
+	if (ctx->curlen > 56) {
+		for ( ; ctx->curlen < 64; ctx->curlen++) {
+			ctx->buf[ctx->curlen] = (unsigned char)0;
+		}
+		sha256_compress(ctx, ctx->buf);
+		ctx->curlen = 0;
+	}
+
+/*
+	zero-fill up to byte 56, where the length field starts
+ */
+	for ( ; ctx->curlen < 56; ctx->curlen++) {
+		ctx->buf[ctx->curlen] = (unsigned char)0;
+	}
+
+/*
+	store the bit length, high word first, and compress the last block
+ */
+	STORE32H(ctx->highLength, ctx->buf, 56);
+	STORE32H(ctx->lowLength, ctx->buf, 60);
+
+	sha256_compress(ctx, ctx->buf);
+
+/*
+	write the eight state words to hval, most significant byte first
+ */
+	for (word = 0; word < 8; word++) {
+		STORE32H(ctx->state[word], hval, (4*word));
+	}
+}
+
+/******************************************************************************/
diff --git a/src/Crypto/Sha2Small.h b/src/Crypto/Sha2Small.h
index 2b79eaf4..1b5c106e 100644
--- a/src/Crypto/Sha2Small.h
+++ b/src/Crypto/Sha2Small.h
@@ -12,21 +12,21 @@
 
 /* Adapted for VeraCrypt */
 
-#ifndef _SHA2_SMALL_H
+#ifndef _SHA2_SMALL_H
 #define _SHA2_SMALL_H
 
-#include "Common/Tcdefs.h"
+#include "Common/Tcdefs.h"
 #include "Common/Endian.h"
 
-#define SHA256_DIGEST_SIZE  32
+#define SHA256_DIGEST_SIZE  32
 #define SHA256_BLOCK_SIZE   64
 
-#define VOID_RETURN	void
-#define INT_RETURN	int
-
-#if defined(__cplusplus)
-extern "C"
-{
+#define VOID_RETURN	void
+#define INT_RETURN	int
+
+#if defined(__cplusplus)
+extern "C"
+{
 #endif
 
 typedef struct {
@@ -40,12 +40,12 @@ typedef struct {
 
 /******************************************************************************/
 
-VOID_RETURN sha256_begin(sha256_ctx* ctx);
-VOID_RETURN sha256_hash(unsigned char* data, unsigned int len, sha256_ctx* ctx);
+VOID_RETURN sha256_begin(sha256_ctx* ctx);
+VOID_RETURN sha256_hash(unsigned char* data, unsigned int len, sha256_ctx* ctx);
 VOID_RETURN sha256_end(unsigned char* hval, sha256_ctx* ctx);
 
-#if defined(__cplusplus)
-}
+#if defined(__cplusplus)
+}
 #endif
 
 /******************************************************************************/
diff --git a/src/Crypto/Sources b/src/Crypto/Sources
index 9b1b988c..6eb7b7b4 100644
--- a/src/Crypto/Sources
+++ b/src/Crypto/Sources
@@ -1,20 +1,20 @@
-TARGETNAME=Crypto
-TARGETTYPE=DRIVER_LIBRARY
-
-INCLUDES = ..
-
-NTTARGETFILES = \
-	"$(OBJ_PATH)\$(O)\Aes_$(TC_ARCH).obj" \
-	"$(OBJ_PATH)\$(O)\Aes_hw_cpu.obj"
-
-SOURCES = \
-	Aes_$(TC_ARCH).asm \
-	Aes_hw_cpu.asm \
-	Aeskey.c \
-	Aestab.c \
-	cpu.c \
-	Rmd160.c \
-	Serpent.c \
-	Sha2.c \
-	Twofish.c \
-	Whirlpool.c
+TARGETNAME=Crypto
+TARGETTYPE=DRIVER_LIBRARY
+
+INCLUDES = ..
+
+NTTARGETFILES = \
+	"$(OBJ_PATH)\$(O)\Aes_$(TC_ARCH).obj" \
+	"$(OBJ_PATH)\$(O)\Aes_hw_cpu.obj"
+
+SOURCES = \
+	Aes_$(TC_ARCH).asm \
+	Aes_hw_cpu.asm \
+	Aeskey.c \
+	Aestab.c \
+	cpu.c \
+	Rmd160.c \
+	Serpent.c \
+	Sha2.c \
+	Twofish.c \
+	Whirlpool.c
diff --git a/src/Crypto/Twofish.c b/src/Crypto/Twofish.c
index 2273ac5e..7c58c91e 100644
--- a/src/Crypto/Twofish.c
+++ b/src/Crypto/Twofish.c
@@ -1,549 +1,549 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 1999, Dr Brian Gladman, Worcester, UK.   All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software is allowed (with or without
- changes) provided that:
-
-  1. source code distributions include the above copyright notice, this
-     list of conditions and the following disclaimer;
-
-  2. binary distributions include the above copyright notice, this list
-     of conditions and the following disclaimer in their documentation;
-
-  3. the name of the copyright holder is not used to endorse products
-     built using this software without specific written permission.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- ---------------------------------------------------------------------------
-
- My thanks to Doug Whiting and Niels Ferguson for comments that led
- to improvements in this implementation.
-
- Issue Date: 14th January 1999
-*/
-
-/* Adapted for TrueCrypt */
-/* Adapted for VeraCrypt */
-
-
-#ifdef TC_WINDOWS_BOOT
-#pragma optimize ("tl", on)
-#endif
-
-#include "Twofish.h"
-#include "Common/Endian.h"
-
-#define Q_TABLES
-#define M_TABLE
-
-#if !defined (TC_MINIMIZE_CODE_SIZE) || defined (TC_WINDOWS_BOOT_TWOFISH)
-#	define MK_TABLE
-#	define ONE_STEP
-#endif
-
-/* finite field arithmetic for GF(2**8) with the modular    */
-/* polynomial x^8 + x^6 + x^5 + x^3 + 1 (0x169)             */
-
-#define G_M 0x0169
-
-static u1byte  tab_5b[4] = { 0, G_M >> 2, G_M >> 1, (G_M >> 1) ^ (G_M >> 2) };
-static u1byte  tab_ef[4] = { 0, (G_M >> 1) ^ (G_M >> 2), G_M >> 1, G_M >> 2 };
-
-#define ffm_01(x)    (x)
-#define ffm_5b(x)   ((x) ^ ((x) >> 2) ^ tab_5b[(x) & 3])
-#define ffm_ef(x)   ((x) ^ ((x) >> 1) ^ ((x) >> 2) ^ tab_ef[(x) & 3])
-
-static u1byte ror4[16] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
-static u1byte ashx[16] = { 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12, 5, 14, 7 };
-
-static u1byte qt0[2][16] = 
-{   { 8, 1, 7, 13, 6, 15, 3, 2, 0, 11, 5, 9, 14, 12, 10, 4 },
-    { 2, 8, 11, 13, 15, 7, 6, 14, 3, 1, 9, 4, 0, 10, 12, 5 }
-};
-
-static u1byte qt1[2][16] =
-{   { 14, 12, 11, 8, 1, 2, 3, 5, 15, 4, 10, 6, 7, 0, 9, 13 }, 
-    { 1, 14, 2, 11, 4, 12, 3, 7, 6, 13, 10, 5, 15, 9, 0, 8 }
-};
-
-static u1byte qt2[2][16] = 
-{   { 11, 10, 5, 14, 6, 13, 9, 0, 12, 8, 15, 3, 2, 4, 7, 1 },
-    { 4, 12, 7, 5, 1, 6, 9, 10, 0, 14, 13, 8, 2, 11, 3, 15 }
-};
-
-static u1byte qt3[2][16] = 
-{   { 13, 7, 15, 4, 1, 2, 6, 14, 9, 11, 3, 0, 8, 5, 12, 10 },
-    { 11, 9, 5, 1, 12, 3, 13, 14, 6, 4, 7, 15, 2, 0, 8, 10 }
-};
- 
-static u1byte qp(const u4byte n, const u1byte x)
-{   u1byte  a0, a1, a2, a3, a4, b0, b1, b2, b3, b4;
-
-    a0 = x >> 4; b0 = x & 15;
-    a1 = a0 ^ b0; b1 = ror4[b0] ^ ashx[a0];
-    a2 = qt0[n][a1]; b2 = qt1[n][b1];
-    a3 = a2 ^ b2; b3 = ror4[b2] ^ ashx[a2];
-    a4 = qt2[n][a3]; b4 = qt3[n][b3];
-    return (b4 << 4) | a4;
-};
-
-#ifdef  Q_TABLES
-
-static u4byte  qt_gen = 0;
-static u1byte  q_tab[2][256];
-
-#define q(n,x)  q_tab[n][x]
-
-static void gen_qtab(void)
-{   u4byte  i;
-
-    for(i = 0; i < 256; ++i)
-    {       
-        q(0,i) = qp(0, (u1byte)i);
-        q(1,i) = qp(1, (u1byte)i);
-    }
-};
-
-#else
-
-#define q(n,x)  qp(n, x)
-
-#endif
-
-#ifdef  M_TABLE
-
-static u4byte  mt_gen = 0;
-static u4byte  m_tab[4][256];
-
-static void gen_mtab(void)
-{   u4byte  i, f01, f5b, fef;
-    
-    for(i = 0; i < 256; ++i)
-    {
-        f01 = q(1,i); f5b = ffm_5b(f01); fef = ffm_ef(f01);
-        m_tab[0][i] = f01 + (f5b << 8) + (fef << 16) + (fef << 24);
-        m_tab[2][i] = f5b + (fef << 8) + (f01 << 16) + (fef << 24);
-
-        f01 = q(0,i); f5b = ffm_5b(f01); fef = ffm_ef(f01);
-        m_tab[1][i] = fef + (fef << 8) + (f5b << 16) + (f01 << 24);
-        m_tab[3][i] = f5b + (f01 << 8) + (fef << 16) + (f5b << 24);
-    }
-};
-
-#define mds(n,x)    m_tab[n][x]
-
-#else
-
-#define fm_00   ffm_01
-#define fm_10   ffm_5b
-#define fm_20   ffm_ef
-#define fm_30   ffm_ef
-#define q_0(x)  q(1,x)
-
-#define fm_01   ffm_ef
-#define fm_11   ffm_ef
-#define fm_21   ffm_5b
-#define fm_31   ffm_01
-#define q_1(x)  q(0,x)
-
-#define fm_02   ffm_5b
-#define fm_12   ffm_ef
-#define fm_22   ffm_01
-#define fm_32   ffm_ef
-#define q_2(x)  q(1,x)
-
-#define fm_03   ffm_5b
-#define fm_13   ffm_01
-#define fm_23   ffm_ef
-#define fm_33   ffm_5b
-#define q_3(x)  q(0,x)
-
-#define f_0(n,x)    ((u4byte)fm_0##n(x))
-#define f_1(n,x)    ((u4byte)fm_1##n(x) << 8)
-#define f_2(n,x)    ((u4byte)fm_2##n(x) << 16)
-#define f_3(n,x)    ((u4byte)fm_3##n(x) << 24)
-
-#define mds(n,x)    f_0(n,q_##n(x)) ^ f_1(n,q_##n(x)) ^ f_2(n,q_##n(x)) ^ f_3(n,q_##n(x))
-
-#endif
-
-static u4byte h_fun(TwofishInstance *instance, const u4byte x, const u4byte key[])
-{   u4byte  b0, b1, b2, b3;
-
-#ifndef M_TABLE
-    u4byte  m5b_b0, m5b_b1, m5b_b2, m5b_b3;
-    u4byte  mef_b0, mef_b1, mef_b2, mef_b3;
-#endif
-
-    b0 = extract_byte(x, 0); b1 = extract_byte(x, 1); b2 = extract_byte(x, 2); b3 = extract_byte(x, 3);
-
-    switch(instance->k_len)
-    {
-    case 4: b0 = q(1, (u1byte) b0) ^ extract_byte(key[3],0);
-            b1 = q(0, (u1byte) b1) ^ extract_byte(key[3],1);
-            b2 = q(0, (u1byte) b2) ^ extract_byte(key[3],2);
-            b3 = q(1, (u1byte) b3) ^ extract_byte(key[3],3);
-    case 3: b0 = q(1, (u1byte) b0) ^ extract_byte(key[2],0);
-            b1 = q(1, (u1byte) b1) ^ extract_byte(key[2],1);
-            b2 = q(0, (u1byte) b2) ^ extract_byte(key[2],2);
-            b3 = q(0, (u1byte) b3) ^ extract_byte(key[2],3);
-    case 2: b0 = q(0, (u1byte) (q(0, (u1byte) b0) ^ extract_byte(key[1],0))) ^ extract_byte(key[0],0);
-            b1 = q(0, (u1byte) (q(1, (u1byte) b1) ^ extract_byte(key[1],1))) ^ extract_byte(key[0],1);
-            b2 = q(1, (u1byte) (q(0, (u1byte) b2) ^ extract_byte(key[1],2))) ^ extract_byte(key[0],2);
-            b3 = q(1, (u1byte) (q(1, (u1byte) b3) ^ extract_byte(key[1],3))) ^ extract_byte(key[0],3);
-    }
-#ifdef  M_TABLE
-
-    return  mds(0, b0) ^ mds(1, b1) ^ mds(2, b2) ^ mds(3, b3);
-
-#else
-
-    b0 = q(1, (u1byte) b0); b1 = q(0, (u1byte) b1); b2 = q(1, (u1byte) b2); b3 = q(0, (u1byte) b3);
-    m5b_b0 = ffm_5b(b0); m5b_b1 = ffm_5b(b1); m5b_b2 = ffm_5b(b2); m5b_b3 = ffm_5b(b3);
-    mef_b0 = ffm_ef(b0); mef_b1 = ffm_ef(b1); mef_b2 = ffm_ef(b2); mef_b3 = ffm_ef(b3);
-    b0 ^= mef_b1 ^ m5b_b2 ^ m5b_b3; b3 ^= m5b_b0 ^ mef_b1 ^ mef_b2;
-    b2 ^= mef_b0 ^ m5b_b1 ^ mef_b3; b1 ^= mef_b0 ^ mef_b2 ^ m5b_b3;
-
-    return b0 | (b3 << 8) | (b2 << 16) | (b1 << 24);
-
-#endif
-};
-
-#ifdef  MK_TABLE
-
-#ifdef  ONE_STEP
-//u4byte  mk_tab[4][256];
-#else
-static u1byte  sb[4][256];
-#endif
-
-#define q20(x)  q(0,q(0,x) ^ extract_byte(key[1],0)) ^ extract_byte(key[0],0)
-#define q21(x)  q(0,q(1,x) ^ extract_byte(key[1],1)) ^ extract_byte(key[0],1)
-#define q22(x)  q(1,q(0,x) ^ extract_byte(key[1],2)) ^ extract_byte(key[0],2)
-#define q23(x)  q(1,q(1,x) ^ extract_byte(key[1],3)) ^ extract_byte(key[0],3)
-
-#define q30(x)  q(0,q(0,q(1, x) ^ extract_byte(key[2],0)) ^ extract_byte(key[1],0)) ^ extract_byte(key[0],0)
-#define q31(x)  q(0,q(1,q(1, x) ^ extract_byte(key[2],1)) ^ extract_byte(key[1],1)) ^ extract_byte(key[0],1)
-#define q32(x)  q(1,q(0,q(0, x) ^ extract_byte(key[2],2)) ^ extract_byte(key[1],2)) ^ extract_byte(key[0],2)
-#define q33(x)  q(1,q(1,q(0, x) ^ extract_byte(key[2],3)) ^ extract_byte(key[1],3)) ^ extract_byte(key[0],3)
-
-#define q40(x)  q(0,q(0,q(1, q(1, x) ^ extract_byte(key[3],0)) ^ extract_byte(key[2],0)) ^ extract_byte(key[1],0)) ^ extract_byte(key[0],0)
-#define q41(x)  q(0,q(1,q(1, q(0, x) ^ extract_byte(key[3],1)) ^ extract_byte(key[2],1)) ^ extract_byte(key[1],1)) ^ extract_byte(key[0],1)
-#define q42(x)  q(1,q(0,q(0, q(0, x) ^ extract_byte(key[3],2)) ^ extract_byte(key[2],2)) ^ extract_byte(key[1],2)) ^ extract_byte(key[0],2)
-#define q43(x)  q(1,q(1,q(0, q(1, x) ^ extract_byte(key[3],3)) ^ extract_byte(key[2],3)) ^ extract_byte(key[1],3)) ^ extract_byte(key[0],3)
-
-static void gen_mk_tab(TwofishInstance *instance, u4byte key[])
-{   u4byte  i;
-    u1byte  by;
-
-	u4byte *mk_tab = instance->mk_tab;
-
-    switch(instance->k_len)
-    {
-    case 2: for(i = 0; i < 256; ++i)
-            {
-                by = (u1byte)i;
-#ifdef ONE_STEP
-                mk_tab[0 + 4*i] = mds(0, q20(by)); mk_tab[1 + 4*i] = mds(1, q21(by));
-                mk_tab[2 + 4*i] = mds(2, q22(by)); mk_tab[3 + 4*i] = mds(3, q23(by));
-#else
-                sb[0][i] = q20(by); sb[1][i] = q21(by); 
-                sb[2][i] = q22(by); sb[3][i] = q23(by);
-#endif
-            }
-            break;
-    
-    case 3: for(i = 0; i < 256; ++i)
-            {
-                by = (u1byte)i;
-#ifdef ONE_STEP
-                mk_tab[0 + 4*i] = mds(0, q30(by)); mk_tab[1 + 4*i] = mds(1, q31(by));
-                mk_tab[2 + 4*i] = mds(2, q32(by)); mk_tab[3 + 4*i] = mds(3, q33(by));
-#else
-                sb[0][i] = q30(by); sb[1][i] = q31(by); 
-                sb[2][i] = q32(by); sb[3][i] = q33(by);
-#endif
-            }
-            break;
-    
-    case 4: for(i = 0; i < 256; ++i)
-            {
-                by = (u1byte)i;
-#ifdef ONE_STEP
-                mk_tab[0 + 4*i] = mds(0, q40(by)); mk_tab[1 + 4*i] = mds(1, q41(by));
-                mk_tab[2 + 4*i] = mds(2, q42(by)); mk_tab[3 + 4*i] = mds(3, q43(by));
-#else
-                sb[0][i] = q40(by); sb[1][i] = q41(by); 
-                sb[2][i] = q42(by); sb[3][i] = q43(by);
-#endif
-            }
-    }
-};
-
-#  ifdef ONE_STEP
-#    define g0_fun(x) ( mk_tab[0 + 4*extract_byte(x,0)] ^ mk_tab[1 + 4*extract_byte(x,1)] \
-                      ^ mk_tab[2 + 4*extract_byte(x,2)] ^ mk_tab[3 + 4*extract_byte(x,3)] )
-#    define g1_fun(x) ( mk_tab[0 + 4*extract_byte(x,3)] ^ mk_tab[1 + 4*extract_byte(x,0)] \
-                      ^ mk_tab[2 + 4*extract_byte(x,1)] ^ mk_tab[3 + 4*extract_byte(x,2)] )
-
-
-#  else
-#    define g0_fun(x) ( mds(0, sb[0][extract_byte(x,0)]) ^ mds(1, sb[1][extract_byte(x,1)]) \
-                      ^ mds(2, sb[2][extract_byte(x,2)]) ^ mds(3, sb[3][extract_byte(x,3)]) )
-#    define g1_fun(x) ( mds(0, sb[0][extract_byte(x,3)]) ^ mds(1, sb[1][extract_byte(x,0)]) \
-                      ^ mds(2, sb[2][extract_byte(x,1)]) ^ mds(3, sb[3][extract_byte(x,2)]) )
-#  endif
-
-#else
-
-#define g0_fun(x)   h_fun(instance, x, instance->s_key)
-#define g1_fun(x)   h_fun(instance, rotl(x,8), instance->s_key)
-
-#endif
-
-/* The (12,8) Reed Soloman code has the generator polynomial
-
-  g(x) = x^4 + (a + 1/a) * x^3 + a * x^2 + (a + 1/a) * x + 1
-
-where the coefficients are in the finite field GF(2^8) with a
-modular polynomial a^8 + a^6 + a^3 + a^2 + 1. To generate the
-remainder we have to start with a 12th order polynomial with our
-eight input bytes as the coefficients of the 4th to 11th terms. 
-That is:
-
-  m[7] * x^11 + m[6] * x^10 ... + m[0] * x^4 + 0 * x^3 +... + 0
-  
-We then multiply the generator polynomial by m[7] * x^7 and subtract
-it - xor in GF(2^8) - from the above to eliminate the x^7 term (the 
-artihmetic on the coefficients is done in GF(2^8). We then multiply 
-the generator polynomial by x^6 * coeff(x^10) and use this to remove
-the x^10 term. We carry on in this way until the x^4 term is removed
-so that we are left with:
-
-  r[3] * x^3 + r[2] * x^2 + r[1] 8 x^1 + r[0]
-
-which give the resulting 4 bytes of the remainder. This is equivalent 
-to the matrix multiplication in the Twofish description but much faster 
-to implement.
-
-*/
-
-#define G_MOD   0x0000014d
-
-static u4byte mds_rem(u4byte p0, u4byte p1)
-{   u4byte  i, t, u;
-
-    for(i = 0; i < 8; ++i)
-    {
-        t = p1 >> 24;   // get most significant coefficient
-        
-        p1 = (p1 << 8) | (p0 >> 24); p0 <<= 8;  // shift others up
-            
-        // multiply t by a (the primitive element - i.e. left shift)
-
-        u = (t << 1); 
-        
-        if(t & 0x80)            // subtract modular polynomial on overflow
-        
-            u ^= G_MOD; 
-
-        p1 ^= t ^ (u << 16);    // remove t * (a * x^2 + 1)  
-
-        u ^= (t >> 1);          // form u = a * t + t / a = t * (a + 1 / a); 
-        
-        if(t & 0x01)            // add the modular polynomial on underflow
-        
-            u ^= G_MOD >> 1;
-
-        p1 ^= (u << 24) | (u << 8); // remove t * (a + 1/a) * (x^3 + x)
-    }
-
-    return p1;
-};
-
-/* initialise the key schedule from the user supplied key   */
-
-u4byte *twofish_set_key(TwofishInstance *instance, const u4byte in_key[])
-{   u4byte  i, a, b, me_key[4], mo_key[4];
-	u4byte *l_key, *s_key;
-
-	l_key = instance->l_key;
-	s_key = instance->s_key;
-
-#ifdef Q_TABLES
-    if(!qt_gen)
-    {
-        gen_qtab(); qt_gen = 1;
-    }
-#endif
-
-#ifdef M_TABLE
-    if(!mt_gen)
-    {
-        gen_mtab(); mt_gen = 1;
-    }
-#endif
-
-    instance->k_len = 4;
-
-    for(i = 0; i < instance->k_len; ++i)
-    {
-        a = LE32(in_key[i + i]);     me_key[i] = a;
-        b = LE32(in_key[i + i + 1]); mo_key[i] = b;
-        s_key[instance->k_len - i - 1] = mds_rem(a, b);
-    }
-
-    for(i = 0; i < 40; i += 2)
-    {
-        a = 0x01010101 * i; b = a + 0x01010101;
-        a = h_fun(instance, a, me_key);
-        b = rotl(h_fun(instance, b, mo_key), 8);
-        l_key[i] = a + b;
-        l_key[i + 1] = rotl(a + 2 * b, 9);
-    }
-
-#ifdef MK_TABLE
-    gen_mk_tab(instance, s_key);
-#endif
-
-    return l_key;
-};
-
-/* encrypt a block of text  */
-
-#ifndef TC_MINIMIZE_CODE_SIZE
-
-#define f_rnd(i)                                                    \
-    t1 = g1_fun(blk[1]); t0 = g0_fun(blk[0]);                       \
-    blk[2] = rotr(blk[2] ^ (t0 + t1 + l_key[4 * (i) + 8]), 1);      \
-    blk[3] = rotl(blk[3], 1) ^ (t0 + 2 * t1 + l_key[4 * (i) + 9]);  \
-    t1 = g1_fun(blk[3]); t0 = g0_fun(blk[2]);                       \
-    blk[0] = rotr(blk[0] ^ (t0 + t1 + l_key[4 * (i) + 10]), 1);     \
-    blk[1] = rotl(blk[1], 1) ^ (t0 + 2 * t1 + l_key[4 * (i) + 11])
-
-void twofish_encrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[])
-{   u4byte  t0, t1, blk[4];
-
-	u4byte *l_key = instance->l_key;
-	u4byte *mk_tab = instance->mk_tab;
-
-	blk[0] = LE32(in_blk[0]) ^ l_key[0];
-    blk[1] = LE32(in_blk[1]) ^ l_key[1];
-    blk[2] = LE32(in_blk[2]) ^ l_key[2];
-    blk[3] = LE32(in_blk[3]) ^ l_key[3];
-
-    f_rnd(0); f_rnd(1); f_rnd(2); f_rnd(3);
-    f_rnd(4); f_rnd(5); f_rnd(6); f_rnd(7);
-
-    out_blk[0] = LE32(blk[2] ^ l_key[4]);
-    out_blk[1] = LE32(blk[3] ^ l_key[5]);
-    out_blk[2] = LE32(blk[0] ^ l_key[6]);
-    out_blk[3] = LE32(blk[1] ^ l_key[7]); 
-};
-
-#else // TC_MINIMIZE_CODE_SIZE
-
-void twofish_encrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[])
-{   u4byte  t0, t1, blk[4];
-
-	u4byte *l_key = instance->l_key;
-#ifdef TC_WINDOWS_BOOT_TWOFISH
-	u4byte *mk_tab = instance->mk_tab;
-#endif
-	int i;
-
-	blk[0] = LE32(in_blk[0]) ^ l_key[0];
-    blk[1] = LE32(in_blk[1]) ^ l_key[1];
-    blk[2] = LE32(in_blk[2]) ^ l_key[2];
-    blk[3] = LE32(in_blk[3]) ^ l_key[3];
-
-	for (i = 0; i <= 7; ++i)
-	{
-		t1 = g1_fun(blk[1]); t0 = g0_fun(blk[0]);
-		blk[2] = rotr(blk[2] ^ (t0 + t1 + l_key[4 * (i) + 8]), 1);
-		blk[3] = rotl(blk[3], 1) ^ (t0 + 2 * t1 + l_key[4 * (i) + 9]);
-		t1 = g1_fun(blk[3]); t0 = g0_fun(blk[2]);
-		blk[0] = rotr(blk[0] ^ (t0 + t1 + l_key[4 * (i) + 10]), 1);
-		blk[1] = rotl(blk[1], 1) ^ (t0 + 2 * t1 + l_key[4 * (i) + 11]);
-	}
-
-    out_blk[0] = LE32(blk[2] ^ l_key[4]);
-    out_blk[1] = LE32(blk[3] ^ l_key[5]);
-    out_blk[2] = LE32(blk[0] ^ l_key[6]);
-    out_blk[3] = LE32(blk[1] ^ l_key[7]); 
-};
-
-#endif // TC_MINIMIZE_CODE_SIZE
-
-/* decrypt a block of text  */
-
-#ifndef TC_MINIMIZE_CODE_SIZE
-
-#define i_rnd(i)                                                        \
-        t1 = g1_fun(blk[1]); t0 = g0_fun(blk[0]);                       \
-        blk[2] = rotl(blk[2], 1) ^ (t0 + t1 + l_key[4 * (i) + 10]);     \
-        blk[3] = rotr(blk[3] ^ (t0 + 2 * t1 + l_key[4 * (i) + 11]), 1); \
-        t1 = g1_fun(blk[3]); t0 = g0_fun(blk[2]);                       \
-        blk[0] = rotl(blk[0], 1) ^ (t0 + t1 + l_key[4 * (i) +  8]);     \
-        blk[1] = rotr(blk[1] ^ (t0 + 2 * t1 + l_key[4 * (i) +  9]), 1)
-
-void twofish_decrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[4])
-{   u4byte  t0, t1, blk[4];
-
-	u4byte *l_key = instance->l_key;
-	u4byte *mk_tab = instance->mk_tab;
-
-    blk[0] = LE32(in_blk[0]) ^ l_key[4];
-    blk[1] = LE32(in_blk[1]) ^ l_key[5];
-    blk[2] = LE32(in_blk[2]) ^ l_key[6];
-    blk[3] = LE32(in_blk[3]) ^ l_key[7];
-
-    i_rnd(7); i_rnd(6); i_rnd(5); i_rnd(4);
-    i_rnd(3); i_rnd(2); i_rnd(1); i_rnd(0);
-
-    out_blk[0] = LE32(blk[2] ^ l_key[0]);
-    out_blk[1] = LE32(blk[3] ^ l_key[1]);
-    out_blk[2] = LE32(blk[0] ^ l_key[2]);
-    out_blk[3] = LE32(blk[1] ^ l_key[3]); 
-};
-
-#else // TC_MINIMIZE_CODE_SIZE
-
-void twofish_decrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[4])
-{   u4byte  t0, t1, blk[4];
-
-	u4byte *l_key = instance->l_key;
-#ifdef TC_WINDOWS_BOOT_TWOFISH
-	u4byte *mk_tab = instance->mk_tab;
-#endif
-	int i;
-
-    blk[0] = LE32(in_blk[0]) ^ l_key[4];
-    blk[1] = LE32(in_blk[1]) ^ l_key[5];
-    blk[2] = LE32(in_blk[2]) ^ l_key[6];
-    blk[3] = LE32(in_blk[3]) ^ l_key[7];
-
-	for (i = 7; i >= 0; --i)
-	{
-		t1 = g1_fun(blk[1]); t0 = g0_fun(blk[0]);
-		blk[2] = rotl(blk[2], 1) ^ (t0 + t1 + l_key[4 * (i) + 10]);
-		blk[3] = rotr(blk[3] ^ (t0 + 2 * t1 + l_key[4 * (i) + 11]), 1);
-		t1 = g1_fun(blk[3]); t0 = g0_fun(blk[2]);
-		blk[0] = rotl(blk[0], 1) ^ (t0 + t1 + l_key[4 * (i) +  8]);
-		blk[1] = rotr(blk[1] ^ (t0 + 2 * t1 + l_key[4 * (i) +  9]), 1);
-	}
-
-    out_blk[0] = LE32(blk[2] ^ l_key[0]);
-    out_blk[1] = LE32(blk[3] ^ l_key[1]);
-    out_blk[2] = LE32(blk[0] ^ l_key[2]);
-    out_blk[3] = LE32(blk[1] ^ l_key[3]); 
-};
-
-#endif // TC_MINIMIZE_CODE_SIZE
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1999, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+
+ My thanks to Doug Whiting and Niels Ferguson for comments that led
+ to improvements in this implementation.
+
+ Issue Date: 14th January 1999
+*/
+
+/* Adapted for TrueCrypt */
+/* Adapted for VeraCrypt */
+
+
+#ifdef TC_WINDOWS_BOOT
+#pragma optimize ("tl", on)
+#endif
+
+#include "Twofish.h"
+#include "Common/Endian.h"
+
+#define Q_TABLES
+#define M_TABLE
+
+#if !defined (TC_MINIMIZE_CODE_SIZE) || defined (TC_WINDOWS_BOOT_TWOFISH)
+#	define MK_TABLE
+#	define ONE_STEP
+#endif
+
+/* finite field arithmetic for GF(2**8) with the modular    */
+/* polynomial x^8 + x^6 + x^5 + x^3 + 1 (0x169)             */
+
+#define G_M 0x0169
+
+static u1byte  tab_5b[4] = { 0, G_M >> 2, G_M >> 1, (G_M >> 1) ^ (G_M >> 2) };
+static u1byte  tab_ef[4] = { 0, (G_M >> 1) ^ (G_M >> 2), G_M >> 1, G_M >> 2 };
+
+#define ffm_01(x)    (x)
+#define ffm_5b(x)   ((x) ^ ((x) >> 2) ^ tab_5b[(x) & 3])
+#define ffm_ef(x)   ((x) ^ ((x) >> 1) ^ ((x) >> 2) ^ tab_ef[(x) & 3])
+
+static u1byte ror4[16] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
+static u1byte ashx[16] = { 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12, 5, 14, 7 };
+
+static u1byte qt0[2][16] = 
+{   { 8, 1, 7, 13, 6, 15, 3, 2, 0, 11, 5, 9, 14, 12, 10, 4 },
+    { 2, 8, 11, 13, 15, 7, 6, 14, 3, 1, 9, 4, 0, 10, 12, 5 }
+};
+
+static u1byte qt1[2][16] =
+{   { 14, 12, 11, 8, 1, 2, 3, 5, 15, 4, 10, 6, 7, 0, 9, 13 }, 
+    { 1, 14, 2, 11, 4, 12, 3, 7, 6, 13, 10, 5, 15, 9, 0, 8 }
+};
+
+static u1byte qt2[2][16] = 
+{   { 11, 10, 5, 14, 6, 13, 9, 0, 12, 8, 15, 3, 2, 4, 7, 1 },
+    { 4, 12, 7, 5, 1, 6, 9, 10, 0, 14, 13, 8, 2, 11, 3, 15 }
+};
+
+static u1byte qt3[2][16] = 
+{   { 13, 7, 15, 4, 1, 2, 6, 14, 9, 11, 3, 0, 8, 5, 12, 10 },
+    { 11, 9, 5, 1, 12, 3, 13, 14, 6, 4, 7, 15, 2, 0, 8, 10 }
+};
+ 
+static u1byte qp(const u4byte n, const u1byte x)	// byte substitution built from the 4-bit tables; n (0 or 1) selects the row of qt0..qt3
+{   u1byte  a0, a1, a2, a3, a4, b0, b1, b2, b3, b4;
+
+    a0 = x >> 4; b0 = x & 15;                   // split x into high nibble a0 and low nibble b0
+    a1 = a0 ^ b0; b1 = ror4[b0] ^ ashx[a0];     // first mixing step via the ror4/ashx lookup tables
+    a2 = qt0[n][a1]; b2 = qt1[n][b1];           // first pair of 4-bit table lookups
+    a3 = a2 ^ b2; b3 = ror4[b2] ^ ashx[a2];     // second mixing step, same form as the first
+    a4 = qt2[n][a3]; b4 = qt3[n][b3];           // second pair of 4-bit table lookups
+    return (b4 << 4) | a4;                      // b4 becomes the high nibble, a4 the low nibble
+};
+
+#ifdef  Q_TABLES
+
+static u4byte  qt_gen = 0;
+static u1byte  q_tab[2][256];
+
+#define q(n,x)  q_tab[n][x]
+
+static void gen_qtab(void)	// fills q_tab[0][] and q_tab[1][] with qp(0, i) and qp(1, i) for every byte value i
+{   u4byte  i;
+
+    for(i = 0; i < 256; ++i)
+    {       
+        q(0,i) = qp(0, (u1byte)i);  // with Q_TABLES defined, q(n,x) expands to q_tab[n][x], so this is a store
+        q(1,i) = qp(1, (u1byte)i);
+    }
+};
+
+#else
+
+#define q(n,x)  qp(n, x)
+
+#endif
+
+#ifdef  M_TABLE
+
+static u4byte  mt_gen = 0;
+static u4byte  m_tab[4][256];
+
+static void gen_mtab(void)	// fills m_tab[0..3][0..255]; reads q(), so q_tab must already be populated (twofish_set_key calls gen_qtab first)
+{   u4byte  i, f01, f5b, fef;
+    
+    for(i = 0; i < 256; ++i)
+    {
+        f01 = q(1,i); f5b = ffm_5b(f01); fef = ffm_ef(f01);             // tables 0 and 2 are derived from q(1,i)
+        m_tab[0][i] = f01 + (f5b << 8) + (fef << 16) + (fef << 24);     // bytes, low to high: f01, f5b, fef, fef
+        m_tab[2][i] = f5b + (fef << 8) + (f01 << 16) + (fef << 24);     // bytes, low to high: f5b, fef, f01, fef
+
+        f01 = q(0,i); f5b = ffm_5b(f01); fef = ffm_ef(f01);             // tables 1 and 3 are derived from q(0,i)
+        m_tab[1][i] = fef + (fef << 8) + (f5b << 16) + (f01 << 24);     // bytes, low to high: fef, fef, f5b, f01
+        m_tab[3][i] = f5b + (f01 << 8) + (fef << 16) + (f5b << 24);     // bytes, low to high: f5b, f01, fef, f5b
+    }
+};
+
+#define mds(n,x)    m_tab[n][x]
+
+#else
+
+#define fm_00   ffm_01
+#define fm_10   ffm_5b
+#define fm_20   ffm_ef
+#define fm_30   ffm_ef
+#define q_0(x)  q(1,x)
+
+#define fm_01   ffm_ef
+#define fm_11   ffm_ef
+#define fm_21   ffm_5b
+#define fm_31   ffm_01
+#define q_1(x)  q(0,x)
+
+#define fm_02   ffm_5b
+#define fm_12   ffm_ef
+#define fm_22   ffm_01
+#define fm_32   ffm_ef
+#define q_2(x)  q(1,x)
+
+#define fm_03   ffm_5b
+#define fm_13   ffm_01
+#define fm_23   ffm_ef
+#define fm_33   ffm_5b
+#define q_3(x)  q(0,x)
+
+#define f_0(n,x)    ((u4byte)fm_0##n(x))
+#define f_1(n,x)    ((u4byte)fm_1##n(x) << 8)
+#define f_2(n,x)    ((u4byte)fm_2##n(x) << 16)
+#define f_3(n,x)    ((u4byte)fm_3##n(x) << 24)
+
+#define mds(n,x)    f_0(n,q_##n(x)) ^ f_1(n,q_##n(x)) ^ f_2(n,q_##n(x)) ^ f_3(n,q_##n(x))
+
+#endif
+
+static u4byte h_fun(TwofishInstance *instance, const u4byte x, const u4byte key[])	// runs the 4 bytes of x through instance->k_len keyed q() layers, then combines them into one word
+{   u4byte  b0, b1, b2, b3;
+
+#ifndef M_TABLE
+    u4byte  m5b_b0, m5b_b1, m5b_b2, m5b_b3;
+    u4byte  mef_b0, mef_b1, mef_b2, mef_b3;
+#endif
+
+    b0 = extract_byte(x, 0); b1 = extract_byte(x, 1); b2 = extract_byte(x, 2); b3 = extract_byte(x, 3);
+
+    switch(instance->k_len)     // cases cascade on purpose: k_len 4 runs all three stages, 3 runs the last two, 2 only the last
+    {
+    case 4: b0 = q(1, (u1byte) b0) ^ extract_byte(key[3],0);
+            b1 = q(0, (u1byte) b1) ^ extract_byte(key[3],1);
+            b2 = q(0, (u1byte) b2) ^ extract_byte(key[3],2);
+            b3 = q(1, (u1byte) b3) ^ extract_byte(key[3],3);    // no break: falls through to case 3
+    case 3: b0 = q(1, (u1byte) b0) ^ extract_byte(key[2],0);
+            b1 = q(1, (u1byte) b1) ^ extract_byte(key[2],1);
+            b2 = q(0, (u1byte) b2) ^ extract_byte(key[2],2);
+            b3 = q(0, (u1byte) b3) ^ extract_byte(key[2],3);    // no break: falls through to case 2
+    case 2: b0 = q(0, (u1byte) (q(0, (u1byte) b0) ^ extract_byte(key[1],0))) ^ extract_byte(key[0],0);
+            b1 = q(0, (u1byte) (q(1, (u1byte) b1) ^ extract_byte(key[1],1))) ^ extract_byte(key[0],1);
+            b2 = q(1, (u1byte) (q(0, (u1byte) b2) ^ extract_byte(key[1],2))) ^ extract_byte(key[0],2);
+            b3 = q(1, (u1byte) (q(1, (u1byte) b3) ^ extract_byte(key[1],3))) ^ extract_byte(key[0],3);
+    }
+#ifdef  M_TABLE
+
+    return  mds(0, b0) ^ mds(1, b1) ^ mds(2, b2) ^ mds(3, b3);  // table path: mds(n,x) is m_tab[n][x], filled by gen_mtab
+
+#else
+
+    b0 = q(1, (u1byte) b0); b1 = q(0, (u1byte) b1); b2 = q(1, (u1byte) b2); b3 = q(0, (u1byte) b3);
+    m5b_b0 = ffm_5b(b0); m5b_b1 = ffm_5b(b1); m5b_b2 = ffm_5b(b2); m5b_b3 = ffm_5b(b3);
+    mef_b0 = ffm_ef(b0); mef_b1 = ffm_ef(b1); mef_b2 = ffm_ef(b2); mef_b3 = ffm_ef(b3);
+    b0 ^= mef_b1 ^ m5b_b2 ^ m5b_b3; b3 ^= m5b_b0 ^ mef_b1 ^ mef_b2;
+    b2 ^= mef_b0 ^ m5b_b1 ^ mef_b3; b1 ^= mef_b0 ^ mef_b2 ^ m5b_b3;
+
+    return b0 | (b3 << 8) | (b2 << 16) | (b1 << 24);    // table-free path: b0 is the low byte, then b3, b2, b1
+
+#endif
+};
+
+#ifdef  MK_TABLE
+
+#ifdef  ONE_STEP
+//u4byte  mk_tab[4][256];
+#else
+static u1byte  sb[4][256];
+#endif
+
+#define q20(x)  q(0,q(0,x) ^ extract_byte(key[1],0)) ^ extract_byte(key[0],0)
+#define q21(x)  q(0,q(1,x) ^ extract_byte(key[1],1)) ^ extract_byte(key[0],1)
+#define q22(x)  q(1,q(0,x) ^ extract_byte(key[1],2)) ^ extract_byte(key[0],2)
+#define q23(x)  q(1,q(1,x) ^ extract_byte(key[1],3)) ^ extract_byte(key[0],3)
+
+#define q30(x)  q(0,q(0,q(1, x) ^ extract_byte(key[2],0)) ^ extract_byte(key[1],0)) ^ extract_byte(key[0],0)
+#define q31(x)  q(0,q(1,q(1, x) ^ extract_byte(key[2],1)) ^ extract_byte(key[1],1)) ^ extract_byte(key[0],1)
+#define q32(x)  q(1,q(0,q(0, x) ^ extract_byte(key[2],2)) ^ extract_byte(key[1],2)) ^ extract_byte(key[0],2)
+#define q33(x)  q(1,q(1,q(0, x) ^ extract_byte(key[2],3)) ^ extract_byte(key[1],3)) ^ extract_byte(key[0],3)
+
+#define q40(x)  q(0,q(0,q(1, q(1, x) ^ extract_byte(key[3],0)) ^ extract_byte(key[2],0)) ^ extract_byte(key[1],0)) ^ extract_byte(key[0],0)
+#define q41(x)  q(0,q(1,q(1, q(0, x) ^ extract_byte(key[3],1)) ^ extract_byte(key[2],1)) ^ extract_byte(key[1],1)) ^ extract_byte(key[0],1)
+#define q42(x)  q(1,q(0,q(0, q(0, x) ^ extract_byte(key[3],2)) ^ extract_byte(key[2],2)) ^ extract_byte(key[1],2)) ^ extract_byte(key[0],2)
+#define q43(x)  q(1,q(1,q(0, q(1, x) ^ extract_byte(key[3],3)) ^ extract_byte(key[2],3)) ^ extract_byte(key[1],3)) ^ extract_byte(key[0],3)
+
+static void gen_mk_tab(TwofishInstance *instance, u4byte key[])	// precomputes the key-dependent lookup tables used by g0_fun/g1_fun; key is read implicitly by the q2x/q3x/q4x macros
+{   u4byte  i;
+    u1byte  by;
+
+	u4byte *mk_tab = instance->mk_tab;	// flat table indexed as mk_tab[column + 4*byte_value]
+
+    switch(instance->k_len)     // one loop per supported k_len (2, 3 or 4); any other value leaves the tables untouched
+    {
+    case 2: for(i = 0; i < 256; ++i)
+            {
+                by = (u1byte)i;
+#ifdef ONE_STEP
+                mk_tab[0 + 4*i] = mds(0, q20(by)); mk_tab[1 + 4*i] = mds(1, q21(by));
+                mk_tab[2 + 4*i] = mds(2, q22(by)); mk_tab[3 + 4*i] = mds(3, q23(by));
+#else
+                sb[0][i] = q20(by); sb[1][i] = q21(by); 
+                sb[2][i] = q22(by); sb[3][i] = q23(by);
+#endif
+            }
+            break;
+    
+    case 3: for(i = 0; i < 256; ++i)
+            {
+                by = (u1byte)i;
+#ifdef ONE_STEP
+                mk_tab[0 + 4*i] = mds(0, q30(by)); mk_tab[1 + 4*i] = mds(1, q31(by));
+                mk_tab[2 + 4*i] = mds(2, q32(by)); mk_tab[3 + 4*i] = mds(3, q33(by));
+#else
+                sb[0][i] = q30(by); sb[1][i] = q31(by); 
+                sb[2][i] = q32(by); sb[3][i] = q33(by);
+#endif
+            }
+            break;
+    
+    case 4: for(i = 0; i < 256; ++i)    // the only case reached from twofish_set_key, which sets k_len to 4
+            {
+                by = (u1byte)i;
+#ifdef ONE_STEP
+                mk_tab[0 + 4*i] = mds(0, q40(by)); mk_tab[1 + 4*i] = mds(1, q41(by));
+                mk_tab[2 + 4*i] = mds(2, q42(by)); mk_tab[3 + 4*i] = mds(3, q43(by));
+#else
+                sb[0][i] = q40(by); sb[1][i] = q41(by); 
+                sb[2][i] = q42(by); sb[3][i] = q43(by);
+#endif
+            }
+    }
+};
+
+#  ifdef ONE_STEP
+#    define g0_fun(x) ( mk_tab[0 + 4*extract_byte(x,0)] ^ mk_tab[1 + 4*extract_byte(x,1)] \
+                      ^ mk_tab[2 + 4*extract_byte(x,2)] ^ mk_tab[3 + 4*extract_byte(x,3)] )
+#    define g1_fun(x) ( mk_tab[0 + 4*extract_byte(x,3)] ^ mk_tab[1 + 4*extract_byte(x,0)] \
+                      ^ mk_tab[2 + 4*extract_byte(x,1)] ^ mk_tab[3 + 4*extract_byte(x,2)] )
+
+
+#  else
+#    define g0_fun(x) ( mds(0, sb[0][extract_byte(x,0)]) ^ mds(1, sb[1][extract_byte(x,1)]) \
+                      ^ mds(2, sb[2][extract_byte(x,2)]) ^ mds(3, sb[3][extract_byte(x,3)]) )
+#    define g1_fun(x) ( mds(0, sb[0][extract_byte(x,3)]) ^ mds(1, sb[1][extract_byte(x,0)]) \
+                      ^ mds(2, sb[2][extract_byte(x,1)]) ^ mds(3, sb[3][extract_byte(x,2)]) )
+#  endif
+
+#else
+
+#define g0_fun(x)   h_fun(instance, x, instance->s_key)
+#define g1_fun(x)   h_fun(instance, rotl(x,8), instance->s_key)
+
+#endif
+
+/* The (12,8) Reed-Solomon code has the generator polynomial
+
+  g(x) = x^4 + (a + 1/a) * x^3 + a * x^2 + (a + 1/a) * x + 1
+
+where the coefficients are in the finite field GF(2^8) with a
+modular polynomial a^8 + a^6 + a^3 + a^2 + 1. To generate the
+remainder we have to start with a 12th order polynomial with our
+eight input bytes as the coefficients of the 4th to 11th terms. 
+That is:
+
+  m[7] * x^11 + m[6] * x^10 ... + m[0] * x^4 + 0 * x^3 +... + 0
+  
+We then multiply the generator polynomial by m[7] * x^7 and subtract
+it - xor in GF(2^8) - from the above to eliminate the x^11 term (the 
+arithmetic on the coefficients is done in GF(2^8)). We then multiply 
+the generator polynomial by x^6 * coeff(x^10) and use this to remove
+the x^10 term. We carry on in this way until the x^4 term is removed
+so that we are left with:
+
+  r[3] * x^3 + r[2] * x^2 + r[1] * x^1 + r[0]
+
+which gives the resulting 4 bytes of the remainder. This is equivalent 
+to the matrix multiplication in the Twofish description but much faster 
+to implement.
+
+*/
+
+#define G_MOD   0x0000014d
+
+static u4byte mds_rem(u4byte p0, u4byte p1)	// reduces the 8 bytes held in p1:p0 (p1 most significant) to a 4-byte remainder, one byte per iteration
+{   u4byte  i, t, u;
+
+    for(i = 0; i < 8; ++i)
+    {
+        t = p1 >> 24;   // get most significant coefficient
+        
+        p1 = (p1 << 8) | (p0 >> 24); p0 <<= 8;  // shift others up
+            
+        // multiply t by a (the primitive element - i.e. left shift)
+
+        u = (t << 1); 
+        
+        if(t & 0x80)            // subtract modular polynomial on overflow
+        
+            u ^= G_MOD; 
+
+        p1 ^= t ^ (u << 16);    // remove t * (a * x^2 + 1)  
+
+        u ^= (t >> 1);          // form u = a * t + t / a = t * (a + 1 / a); 
+        
+        if(t & 0x01)            // add the modular polynomial on underflow
+        
+            u ^= G_MOD >> 1;
+
+        p1 ^= (u << 24) | (u << 8); // remove t * (a + 1/a) * (x^3 + x)
+    }
+
+    return p1;  // after 8 steps p1 holds the four remainder bytes
+};
+
+/* Initialise the key schedule from the user supplied key: reads in_key[0..7] (k_len is fixed at 4),
+   fills instance->s_key[0..3] and instance->l_key[0..39], and returns instance->l_key. */
+u4byte *twofish_set_key(TwofishInstance *instance, const u4byte in_key[])
+{   u4byte  i, a, b, me_key[4], mo_key[4];
+	u4byte *l_key, *s_key;
+	// local aliases for the two schedule arrays stored in the instance
+	l_key = instance->l_key;
+	s_key = instance->s_key;
+	// gen_qtab()/gen_mtab() run only while their qt_gen/mt_gen flag is still zero
+#ifdef Q_TABLES
+    if(!qt_gen)
+    {
+        gen_qtab(); qt_gen = 1;
+    }
+#endif
+
+#ifdef M_TABLE
+    if(!mt_gen)
+    {
+        gen_mtab(); mt_gen = 1;
+    }
+#endif
+
+    instance->k_len = 4;    // key length in 64-bit units: the loop below reads two in_key words per unit
+    // even words -> me_key, odd words -> mo_key; s_key is filled from its last used slot downwards
+    for(i = 0; i < instance->k_len; ++i)
+    {
+        a = LE32(in_key[i + i]);     me_key[i] = a;
+        b = LE32(in_key[i + i + 1]); mo_key[i] = b;
+        s_key[instance->k_len - i - 1] = mds_rem(a, b);
+    }
+    // 20 iterations, each producing the subkey pair l_key[i], l_key[i + 1]
+    for(i = 0; i < 40; i += 2)
+    {
+        a = 0x01010101 * i; b = a + 0x01010101;
+        a = h_fun(instance, a, me_key);
+        b = rotl(h_fun(instance, b, mo_key), 8);
+        l_key[i] = a + b;
+        l_key[i + 1] = rotl(a + 2 * b, 9);
+    }
+
+#ifdef MK_TABLE
+    gen_mk_tab(instance, s_key);
+#endif
+
+    return l_key;
+};
+
+/* encrypt a block of text  */
+
+#ifndef TC_MINIMIZE_CODE_SIZE
+
+#define f_rnd(i)                                                    \
+    t1 = g1_fun(blk[1]); t0 = g0_fun(blk[0]);                       \
+    blk[2] = rotr(blk[2] ^ (t0 + t1 + l_key[4 * (i) + 8]), 1);      \
+    blk[3] = rotl(blk[3], 1) ^ (t0 + 2 * t1 + l_key[4 * (i) + 9]);  \
+    t1 = g1_fun(blk[3]); t0 = g0_fun(blk[2]);                       \
+    blk[0] = rotr(blk[0] ^ (t0 + t1 + l_key[4 * (i) + 10]), 1);     \
+    blk[1] = rotl(blk[1], 1) ^ (t0 + 2 * t1 + l_key[4 * (i) + 11])
+/* Encrypt one block: in_blk[0..3] -> out_blk[0..3], using the subkeys in instance->l_key. f_rnd relies on the locals t0, t1, blk and l_key. */
+void twofish_encrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[])
+{   u4byte  t0, t1, blk[4];
+
+	u4byte *l_key = instance->l_key;
+	u4byte *mk_tab = instance->mk_tab;	// not named below; presumably referenced inside g0_fun/g1_fun - confirm
+	// xor the input words with l_key[0..3]
+	blk[0] = LE32(in_blk[0]) ^ l_key[0];
+    blk[1] = LE32(in_blk[1]) ^ l_key[1];
+    blk[2] = LE32(in_blk[2]) ^ l_key[2];
+    blk[3] = LE32(in_blk[3]) ^ l_key[3];
+    // eight unrolled f_rnd steps; step i consumes l_key[4*i+8 .. 4*i+11]
+    f_rnd(0); f_rnd(1); f_rnd(2); f_rnd(3);
+    f_rnd(4); f_rnd(5); f_rnd(6); f_rnd(7);
+    // xor with l_key[4..7]; the word pairs are written out swapped (blk[2], blk[3] first)
+    out_blk[0] = LE32(blk[2] ^ l_key[4]);
+    out_blk[1] = LE32(blk[3] ^ l_key[5]);
+    out_blk[2] = LE32(blk[0] ^ l_key[6]);
+    out_blk[3] = LE32(blk[1] ^ l_key[7]); 
+};
+
+#else // TC_MINIMIZE_CODE_SIZE
+
+void twofish_encrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[])
+{   u4byte  t0, t1, blk[4];
+	/* TC_MINIMIZE_CODE_SIZE variant: same steps as the unrolled version, but run from a loop. */
+	u4byte *l_key = instance->l_key;
+#ifdef TC_WINDOWS_BOOT_TWOFISH
+	u4byte *mk_tab = instance->mk_tab;
+#endif
+	int i;
+	// xor the input words with l_key[0..3]
+	blk[0] = LE32(in_blk[0]) ^ l_key[0];
+    blk[1] = LE32(in_blk[1]) ^ l_key[1];
+    blk[2] = LE32(in_blk[2]) ^ l_key[2];
+    blk[3] = LE32(in_blk[3]) ^ l_key[3];
+	// pass i consumes l_key[4*i+8 .. 4*i+11]
+	for (i = 0; i <= 7; ++i)
+	{
+		t1 = g1_fun(blk[1]); t0 = g0_fun(blk[0]);
+		blk[2] = rotr(blk[2] ^ (t0 + t1 + l_key[4 * (i) + 8]), 1);
+		blk[3] = rotl(blk[3], 1) ^ (t0 + 2 * t1 + l_key[4 * (i) + 9]);
+		t1 = g1_fun(blk[3]); t0 = g0_fun(blk[2]);
+		blk[0] = rotr(blk[0] ^ (t0 + t1 + l_key[4 * (i) + 10]), 1);
+		blk[1] = rotl(blk[1], 1) ^ (t0 + 2 * t1 + l_key[4 * (i) + 11]);
+	}
+	// xor with l_key[4..7]; the word pairs are written out swapped (blk[2], blk[3] first)
+    out_blk[0] = LE32(blk[2] ^ l_key[4]);
+    out_blk[1] = LE32(blk[3] ^ l_key[5]);
+    out_blk[2] = LE32(blk[0] ^ l_key[6]);
+    out_blk[3] = LE32(blk[1] ^ l_key[7]); 
+};
+
+#endif // TC_MINIMIZE_CODE_SIZE
+
+/* decrypt a block of text  */
+
+#ifndef TC_MINIMIZE_CODE_SIZE
+
+#define i_rnd(i)                                                        \
+        t1 = g1_fun(blk[1]); t0 = g0_fun(blk[0]);                       \
+        blk[2] = rotl(blk[2], 1) ^ (t0 + t1 + l_key[4 * (i) + 10]);     \
+        blk[3] = rotr(blk[3] ^ (t0 + 2 * t1 + l_key[4 * (i) + 11]), 1); \
+        t1 = g1_fun(blk[3]); t0 = g0_fun(blk[2]);                       \
+        blk[0] = rotl(blk[0], 1) ^ (t0 + t1 + l_key[4 * (i) +  8]);     \
+        blk[1] = rotr(blk[1] ^ (t0 + 2 * t1 + l_key[4 * (i) +  9]), 1)
+/* Decrypt one block: in_blk[0..3] -> out_blk[0..3], using the subkeys in instance->l_key. i_rnd relies on the locals t0, t1, blk and l_key. */
+void twofish_decrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[4])
+{   u4byte  t0, t1, blk[4];
+
+	u4byte *l_key = instance->l_key;
+	u4byte *mk_tab = instance->mk_tab;	// not named below; presumably referenced inside g0_fun/g1_fun - confirm
+    // xor the input words with l_key[4..7] (the words the encrypt routine applies last)
+    blk[0] = LE32(in_blk[0]) ^ l_key[4];
+    blk[1] = LE32(in_blk[1]) ^ l_key[5];
+    blk[2] = LE32(in_blk[2]) ^ l_key[6];
+    blk[3] = LE32(in_blk[3]) ^ l_key[7];
+    // eight unrolled i_rnd steps, indices running 7 down to 0
+    i_rnd(7); i_rnd(6); i_rnd(5); i_rnd(4);
+    i_rnd(3); i_rnd(2); i_rnd(1); i_rnd(0);
+    // xor with l_key[0..3]; the word pairs are written out swapped (blk[2], blk[3] first)
+    out_blk[0] = LE32(blk[2] ^ l_key[0]);
+    out_blk[1] = LE32(blk[3] ^ l_key[1]);
+    out_blk[2] = LE32(blk[0] ^ l_key[2]);
+    out_blk[3] = LE32(blk[1] ^ l_key[3]); 
+};
+
+#else // TC_MINIMIZE_CODE_SIZE
+
+void twofish_decrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[4])
+{   u4byte  t0, t1, blk[4];
+	/* TC_MINIMIZE_CODE_SIZE variant: same steps as the unrolled version, but run from a loop. */
+	u4byte *l_key = instance->l_key;
+#ifdef TC_WINDOWS_BOOT_TWOFISH
+	u4byte *mk_tab = instance->mk_tab;
+#endif
+	int i;	// signed on purpose: the loop below terminates when i drops below 0
+    // xor the input words with l_key[4..7]
+    blk[0] = LE32(in_blk[0]) ^ l_key[4];
+    blk[1] = LE32(in_blk[1]) ^ l_key[5];
+    blk[2] = LE32(in_blk[2]) ^ l_key[6];
+    blk[3] = LE32(in_blk[3]) ^ l_key[7];
+	// pass i consumes l_key[4*i+8 .. 4*i+11], highest index first
+	for (i = 7; i >= 0; --i)
+	{
+		t1 = g1_fun(blk[1]); t0 = g0_fun(blk[0]);
+		blk[2] = rotl(blk[2], 1) ^ (t0 + t1 + l_key[4 * (i) + 10]);
+		blk[3] = rotr(blk[3] ^ (t0 + 2 * t1 + l_key[4 * (i) + 11]), 1);
+		t1 = g1_fun(blk[3]); t0 = g0_fun(blk[2]);
+		blk[0] = rotl(blk[0], 1) ^ (t0 + t1 + l_key[4 * (i) +  8]);
+		blk[1] = rotr(blk[1] ^ (t0 + 2 * t1 + l_key[4 * (i) +  9]), 1);
+	}
+	// xor with l_key[0..3]; the word pairs are written out swapped (blk[2], blk[3] first)
+    out_blk[0] = LE32(blk[2] ^ l_key[0]);
+    out_blk[1] = LE32(blk[3] ^ l_key[1]);
+    out_blk[2] = LE32(blk[0] ^ l_key[2]);
+    out_blk[3] = LE32(blk[1] ^ l_key[3]); 
+};
+
+#endif // TC_MINIMIZE_CODE_SIZE
diff --git a/src/Crypto/Twofish.h b/src/Crypto/Twofish.h
index ed400257..1011608e 100644
--- a/src/Crypto/Twofish.h
+++ b/src/Crypto/Twofish.h
@@ -1,56 +1,56 @@
-#ifndef TWOFISH_H
-#define TWOFISH_H
-
-#include "Common/Tcdefs.h"
-
-#if defined(__cplusplus)
-extern "C"
-{
-#endif
-
-#ifndef u4byte
-#define u4byte	unsigned __int32
-#endif
-#ifndef u1byte
-#define u1byte	unsigned char
-#endif
-
-#ifndef extract_byte
-#define extract_byte(x,n)   ((u1byte)((x) >> (8 * n)))
-#endif
-
-#ifndef rotl
-
-#ifdef _WIN32
-#include <stdlib.h>
-#pragma intrinsic(_lrotr,_lrotl)
-#define rotr(x,n) _lrotr(x,n)
-#define rotl(x,n) _lrotl(x,n)
-#else
-#define rotr(x,n) (((x)>>(n))|((x)<<(32-(n))))
-#define rotl(x,n) (((x)<<(n))|((x)>>(32-(n))))
-#endif
-
-#endif
-typedef struct
-{
-	u4byte l_key[40];
-	u4byte s_key[4];
-#if !defined (TC_MINIMIZE_CODE_SIZE) || defined (TC_WINDOWS_BOOT_TWOFISH)
-	u4byte mk_tab[4 * 256];
-#endif
-	u4byte k_len;
-} TwofishInstance;
-
-#define TWOFISH_KS		sizeof(TwofishInstance)
-
-/* in_key must be 32-bytes long */
-u4byte * twofish_set_key(TwofishInstance *instance, const u4byte in_key[]);
-void twofish_encrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[]);
-void twofish_decrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[4]);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif // TWOFISH_H
+#ifndef TWOFISH_H
+#define TWOFISH_H
+
+#include "Common/Tcdefs.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#ifndef u4byte
+#define u4byte	unsigned __int32
+#endif
+#ifndef u1byte
+#define u1byte	unsigned char
+#endif
+
+#ifndef extract_byte
+#define extract_byte(x,n)   ((u1byte)((x) >> (8 * (n))))	/* byte n of x (0 = least significant); n is parenthesised so arguments like i+1 scale correctly */
+#endif
+
+#ifndef rotl
+/* 32-bit rotates: _lrotr/_lrotl intrinsics under _WIN32, plain shifts elsewhere. */
+#ifdef _WIN32
+#include <stdlib.h>
+#pragma intrinsic(_lrotr,_lrotl)
+#define rotr(x,n) _lrotr(x,n)
+#define rotl(x,n) _lrotl(x,n)
+#else
+#define rotr(x,n) (((x)>>(n))|((x)<<(32-(n))))
+#define rotl(x,n) (((x)<<(n))|((x)>>(32-(n))))
+#endif
+/* NOTE(review): the shift form evaluates x and n twice and needs 0 < n < 32 (n == 0 would shift by 32). */
+#endif
+typedef struct	/* per-key Twofish state; filled in by twofish_set_key() */
+{
+	u4byte l_key[40];	/* subkey words: [0..7] are xored with the block at entry/exit, [8..39] are used by the round steps */
+	u4byte s_key[4];	/* words produced from the key by the Reed-Solomon remainder step */
+#if !defined (TC_MINIMIZE_CODE_SIZE) || defined (TC_WINDOWS_BOOT_TWOFISH)
+	u4byte mk_tab[4 * 256];	/* key-dependent lookup table, generated by twofish_set_key() when MK_TABLE is defined */
+#endif
+	u4byte k_len;	/* key length in 64-bit units; twofish_set_key() always stores 4 */
+} TwofishInstance;
+
+#define TWOFISH_KS		sizeof(TwofishInstance)
+
+/* in_key must be 32 bytes long (eight u4byte words are read); twofish_set_key returns a pointer to instance->l_key. Each encrypt/decrypt call processes one block of four u4byte words. */
+u4byte * twofish_set_key(TwofishInstance *instance, const u4byte in_key[]);
+void twofish_encrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[]);
+void twofish_decrypt(TwofishInstance *instance, const u4byte in_blk[4], u4byte out_blk[4]);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // TWOFISH_H
diff --git a/src/Crypto/Whirlpool.h b/src/Crypto/Whirlpool.h
index df8aa7ac..9e771935 100644
--- a/src/Crypto/Whirlpool.h
+++ b/src/Crypto/Whirlpool.h
@@ -1,27 +1,27 @@
-#ifndef WHIRLPOOL_H
-#define WHIRLPOOL_H 1
-
-#include "Common/Tcdefs.h"
-#include "config.h"
-
-typedef struct WHIRLPOOL_CTX {
-	uint64 countLo;
-	uint64 countHi;
-	CRYPTOPP_ALIGN_DATA(16) uint64 data[8];
-	CRYPTOPP_ALIGN_DATA(16) uint64 state[8];
-} WHIRLPOOL_CTX;
-
-// -------------
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-void WHIRLPOOL_add(const unsigned char * source, unsigned __int32 sourceBits, WHIRLPOOL_CTX * const ctx);
-void WHIRLPOOL_finalize(WHIRLPOOL_CTX* const ctx, unsigned char * result);
-void WHIRLPOOL_init(WHIRLPOOL_CTX* const ctx);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* WHIRLPOOL_H */
+#ifndef WHIRLPOOL_H
+#define WHIRLPOOL_H 1
+
+#include "Common/Tcdefs.h"
+#include "config.h"
+
+typedef struct WHIRLPOOL_CTX {	/* hashing context passed to WHIRLPOOL_init / WHIRLPOOL_add / WHIRLPOOL_finalize */
+	uint64 countLo;	/* presumably the low half of the processed-length counter - confirm in Whirlpool.c */
+	uint64 countHi;	/* presumably the high half of the same counter - confirm */
+	CRYPTOPP_ALIGN_DATA(16) uint64 data[8];	/* 64-byte buffer with 16-byte alignment requested; presumably pending input - confirm */
+	CRYPTOPP_ALIGN_DATA(16) uint64 state[8];	/* 64-byte buffer with 16-byte alignment requested; presumably the running hash state - confirm */
+} WHIRLPOOL_CTX;
+
+// Public hashing API, declared with C linkage when this header is included from C++
+#if defined(__cplusplus)
+extern "C" {
+#endif
+// NOTE(review): presumably used as init -> add (sourceBits looks like a length in bits) -> finalize; confirm against Whirlpool.c
+void WHIRLPOOL_add(const unsigned char * source, unsigned __int32 sourceBits, WHIRLPOOL_CTX * const ctx);
+void WHIRLPOOL_finalize(WHIRLPOOL_CTX* const ctx, unsigned char * result);
+void WHIRLPOOL_init(WHIRLPOOL_CTX* const ctx);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* WHIRLPOOL_H */
diff --git a/src/Crypto/cpu.c b/src/Crypto/cpu.c
index 58a131af..4274a8ae 100644
--- a/src/Crypto/cpu.c
+++ b/src/Crypto/cpu.c
@@ -1,231 +1,231 @@
-/* cpu.c - written and placed in the public domain by Wei Dai */
-
-#include "cpu.h"
-#include "misc.h"
-
-#ifndef EXCEPTION_EXECUTE_HANDLER
-#define EXCEPTION_EXECUTE_HANDLER 1
-#endif
-
-#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
-#include <signal.h>
-#include <setjmp.h>
-#endif
-
-#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
-#include <emmintrin.h>
-#endif
-
-#ifdef CRYPTOPP_CPUID_AVAILABLE
-
-#if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
-
-int CpuId(uint32 input, uint32 output[4])
-{
-	__cpuid((int *)output, input);
-	return 1;
-}
-
-#else
-
-#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-typedef void (*SigHandler)(int);
-
-static jmp_buf s_jmpNoCPUID;
-static void SigIllHandlerCPUID(int p)
-{
-	longjmp(s_jmpNoCPUID, 1);
-}
-
-#if CRYPTOPP_BOOL_X64 == 0
-static jmp_buf s_jmpNoSSE2;
-static void SigIllHandlerSSE2(int p)
-{
-	longjmp(s_jmpNoSSE2, 1);
-}
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-#endif
-
-int CpuId(uint32 input, uint32 output[4])
-{
-#ifdef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
-    __try
-	{
-		__asm
-		{
-			mov eax, input
-            mov ecx, 0
-			cpuid
-			mov edi, output
-			mov [edi], eax
-			mov [edi+4], ebx
-			mov [edi+8], ecx
-			mov [edi+12], edx
-		}
-	}
-    __except (EXCEPTION_EXECUTE_HANDLER)
-	{
-		return 0;
-    }
-    
-	// function 0 returns the highest basic function understood in EAX
-	if(input == 0)
-        return !!output[0]? 1 : 0;
-
-	return 1;
-#else
-	// longjmp and clobber warnings. Volatile is required.
-	// http://github.com/weidai11/cryptopp/issues/24
-	// http://stackoverflow.com/q/7721854
-	volatile int result = 1;
-
-	SigHandler oldHandler = signal(SIGILL, SigIllHandlerCPUID);
-	if (oldHandler == SIG_ERR)
-		result = 0;
-
-	if (setjmp(s_jmpNoCPUID))
-		result = 0;
-	else
-	{
-		asm volatile
-		(
-            // save ebx in case -fPIC is being used
-            // TODO: this might need an early clobber on EDI.
-#if CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
-            "pushq %%rbx; cpuid; mov %%ebx, %%edi; popq %%rbx"
-#else
-            "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
-#endif
-            : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
-            : "a" (input), "c" (0)
-         );
-	}
-
-	signal(SIGILL, oldHandler);
-	return result;
-#endif
-}
-
-#endif
-
-static int TrySSE2()
-{
-#if CRYPTOPP_BOOL_X64
-	return 1;
-#elif defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
-    __try
-	{
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
-        AS2(por xmm0, xmm0)        // executing SSE2 instruction
-#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
-		__m128i x = _mm_setzero_si128();
-		return _mm_cvtsi128_si32(x) == 0 ? 1 : 0;
-#endif
-	}
-    __except (EXCEPTION_EXECUTE_HANDLER)
-	{
-		return 0;
-    }
-	return 1;
-#else
-	// longjmp and clobber warnings. Volatile is required.
-	// http://github.com/weidai11/cryptopp/issues/24
-	// http://stackoverflow.com/q/7721854
-	volatile int result = 1;
-
-	SigHandler oldHandler = signal(SIGILL, SigIllHandlerSSE2);
-	if (oldHandler == SIG_ERR)
-		return 0;
-
-	if (setjmp(s_jmpNoSSE2))
-		result = 1;
-	else
-	{
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
-		__asm __volatile ("por %xmm0, %xmm0");
-#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
-		__m128i x = _mm_setzero_si128();
-		result = _mm_cvtsi128_si32(x) == 0? 1 : 0;
-#endif
-	}
-
-	signal(SIGILL, oldHandler);
-	return result;
-#endif
-}
-
-int g_x86DetectionDone = 0;
-int g_hasISSE = 0, g_hasSSE2 = 0, g_hasSSSE3 = 0, g_hasMMX = 0, g_hasAESNI = 0, g_hasCLMUL = 0, g_isP4 = 0;
-uint32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
-
-VC_INLINE int IsIntel(const uint32 output[4])
-{
-	// This is the "GenuineIntel" string
-	return (output[1] /*EBX*/ == 0x756e6547) &&
-    (output[2] /*ECX*/ == 0x6c65746e) &&
-    (output[3] /*EDX*/ == 0x49656e69);
-}
-
-VC_INLINE int IsAMD(const uint32 output[4])
-{
-	// This is the "AuthenticAMD" string
-	return (output[1] /*EBX*/ == 0x68747541) &&
-    (output[2] /*ECX*/ == 0x69746E65) &&
-    (output[3] /*EDX*/ == 0x444D4163);
-}
-
-void DetectX86Features()
-{
-	uint32 cpuid[4], cpuid1[4];
-	if (!CpuId(0, cpuid))
-		return;
-	if (!CpuId(1, cpuid1))
-		return;
-
-	g_hasMMX = (cpuid1[3] & (1 << 23)) != 0;
-	if ((cpuid1[3] & (1 << 26)) != 0)
-		g_hasSSE2 = TrySSE2();
-	g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9));
-	g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25));
-	g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1));
-
-	if ((cpuid1[3] & (1 << 25)) != 0)
-		g_hasISSE = 1;
-	else
-	{
-		uint32 cpuid2[4];
-		CpuId(0x080000000, cpuid2);
-		if (cpuid2[0] >= 0x080000001)
-		{
-			CpuId(0x080000001, cpuid2);
-			g_hasISSE = (cpuid2[3] & (1 << 22)) != 0;
-		}
-	}
-
-	if (IsIntel(cpuid))
-	{
-		g_isP4 = ((cpuid1[0] >> 8) & 0xf) == 0xf;
-		g_cacheLineSize = 8 * GETBYTE(cpuid1[1], 1);
-	}
-	else if (IsAMD(cpuid))
-	{
-		CpuId(0x80000005, cpuid);
-		g_cacheLineSize = GETBYTE(cpuid[2], 0);
-	}
-
-	if (!g_cacheLineSize)
-		g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
-
-	*((volatile int*)&g_x86DetectionDone) = 1;
-}
-
-#endif
+/* cpu.c - written and placed in the public domain by Wei Dai */
+
+#include "cpu.h"
+#include "misc.h"
+
+#ifndef EXCEPTION_EXECUTE_HANDLER
+#define EXCEPTION_EXECUTE_HANDLER 1
+#endif
+
+#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
+#include <signal.h>
+#include <setjmp.h>
+#endif
+
+#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+#include <emmintrin.h>
+#endif
+
+#ifdef CRYPTOPP_CPUID_AVAILABLE
+
+#if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
+
+int CpuId(uint32 input, uint32 output[4])	// variant built when _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
+{
+	__cpuid((int *)output, input);	// compiler intrinsic writes the four result words into output[0..3]
+	return 1;	// this path always reports success
+}
+
+#else
+
+#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef void (*SigHandler)(int);
+// SIGILL handler used while probing CPUID: jumps back to the setjmp() point in CpuId() with value 1
+static jmp_buf s_jmpNoCPUID;
+static void SigIllHandlerCPUID(int p)
+{
+	longjmp(s_jmpNoCPUID, 1);
+}
+// Same scheme for the SSE2 probe in TrySSE2(); compiled only when CRYPTOPP_BOOL_X64 is 0
+#if CRYPTOPP_BOOL_X64 == 0
+static jmp_buf s_jmpNoSSE2;
+static void SigIllHandlerSSE2(int p)
+{
+	longjmp(s_jmpNoSSE2, 1);
+}
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+#endif
+
+int CpuId(uint32 input, uint32 output[4])
+{	/* Executes CPUID with EAX = input and ECX = 0, stores EAX, EBX, ECX, EDX in output[0..3]; returns 1 on success, 0 on failure. */
+#ifdef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
+    __try
+	{
+		__asm
+		{
+			mov eax, input
+            mov ecx, 0
+			cpuid
+			mov edi, output
+			mov [edi], eax
+			mov [edi+4], ebx
+			mov [edi+8], ecx
+			mov [edi+12], edx
+		}
+	}
+    __except (EXCEPTION_EXECUTE_HANDLER)
+	{
+		return 0;
+    }
+    
+	// function 0 returns the highest basic function understood in EAX
+	if(input == 0)
+        return !!output[0]? 1 : 0;
+
+	return 1;
+#else
+	// longjmp and clobber warnings. Volatile is required.
+	// http://github.com/weidai11/cryptopp/issues/24
+	// http://stackoverflow.com/q/7721854
+	volatile int result = 1;
+	// trap SIGILL for the duration of the probe so an unsupported CPUID reports failure instead of killing the process
+	SigHandler oldHandler = signal(SIGILL, SigIllHandlerCPUID);
+	if (oldHandler == SIG_ERR)
+		return 0;	// no handler installed: do not run CPUID unprotected, and never hand SIG_ERR back to signal()
+
+	if (setjmp(s_jmpNoCPUID))
+		result = 0;	// arrived here through longjmp from the SIGILL handler
+	else
+	{
+		asm volatile
+		(
+            // save ebx in case -fPIC is being used
+            // TODO: this might need an early clobber on EDI.
+#if CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
+            "pushq %%rbx; cpuid; mov %%ebx, %%edi; popq %%rbx"
+#else
+            "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
+#endif
+            : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
+            : "a" (input), "c" (0)
+         );
+	}
+	// put back whatever SIGILL handler was installed before the probe
+	signal(SIGILL, oldHandler);
+	return result;
+#endif
+}
+
+#endif
+
+static int TrySSE2()
+{	/* Returns 1 when an SSE2 instruction can actually be executed, 0 when it faults or the probe cannot be set up. */
+#if CRYPTOPP_BOOL_X64
+	return 1;
+#elif defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
+    __try
+	{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+        AS2(por xmm0, xmm0)        // executing SSE2 instruction
+#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+		__m128i x = _mm_setzero_si128();
+		return _mm_cvtsi128_si32(x) == 0 ? 1 : 0;
+#endif
+	}
+    __except (EXCEPTION_EXECUTE_HANDLER)
+	{
+		return 0;
+    }
+	return 1;
+#else
+	// longjmp and clobber warnings. Volatile is required.
+	// http://github.com/weidai11/cryptopp/issues/24
+	// http://stackoverflow.com/q/7721854
+	volatile int result = 1;
+	// trap SIGILL while the probe instruction runs
+	SigHandler oldHandler = signal(SIGILL, SigIllHandlerSSE2);
+	if (oldHandler == SIG_ERR)
+		return 0;
+
+	if (setjmp(s_jmpNoSSE2))
+		result = 0;	// reached via longjmp from the SIGILL handler: the SSE2 instruction faulted, so SSE2 is NOT usable
+	else
+	{
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+		__asm __volatile ("por %xmm0, %xmm0");
+#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+		__m128i x = _mm_setzero_si128();
+		result = _mm_cvtsi128_si32(x) == 0? 1 : 0;
+#endif
+	}
+	// put back whatever SIGILL handler was installed before the probe
+	signal(SIGILL, oldHandler);
+	return result;
+#endif
+}
+
+int g_x86DetectionDone = 0;	// set to 1 only when DetectX86Features() runs to completion
+int g_hasISSE = 0, g_hasSSE2 = 0, g_hasSSSE3 = 0, g_hasMMX = 0, g_hasAESNI = 0, g_hasCLMUL = 0, g_isP4 = 0;	// feature flags; all stay 0 until detection sets them
+uint32 g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;	// default value; DetectX86Features() may replace it for Intel/AMD CPUs
+
+VC_INLINE int IsIntel(const uint32 output[4])
+{
+	const int ebxMatches = (output[1] == 0x756e6547);	/* EBX: "Genu" */
+	const int ecxMatches = (output[2] == 0x6c65746e);	/* ECX: "ntel" */
+	const int edxMatches = (output[3] == 0x49656e69);	/* EDX: "ineI" */
+	return ebxMatches && ecxMatches && edxMatches;	/* all three together spell "GenuineIntel" */
+}
+
+VC_INLINE int IsAMD(const uint32 output[4])
+{
+	// "AuthenticAMD" is reported as EBX:EDX:ECX = "Auth":"enti":"cAMD", so ECX carries "cAMD" and EDX carries "enti"
+	return (output[1] /*EBX*/ == 0x68747541) &&
+    (output[2] /*ECX*/ == 0x444D4163) &&
+    (output[3] /*EDX*/ == 0x69746E65);
+}
+
+void DetectX86Features()
+{
+	uint32 vendorLeaf[4], featureLeaf[4];	// CPUID leaf 0 / leaf 1 results, indexed EAX, EBX, ECX, EDX
+	uint32 featEcx, featEdx;
+	if (!CpuId(0, vendorLeaf) || !CpuId(1, featureLeaf))
+		return;	// nothing detected; g_x86DetectionDone stays clear
+	featEcx = featureLeaf[2];
+	featEdx = featureLeaf[3];
+
+	g_hasMMX = (featEdx & (1 << 23)) != 0;
+	if (featEdx & (1 << 26))	// SSE2 advertised: confirm by actually executing an SSE2 instruction
+		g_hasSSE2 = TrySSE2();
+	g_hasSSSE3 = g_hasSSE2 && (featEcx & (1 << 9));
+	g_hasAESNI = g_hasSSE2 && (featEcx & (1 << 25));
+	g_hasCLMUL = g_hasSSE2 && (featEcx & (1 << 1));
+
+	if (featEdx & (1 << 25))
+		g_hasISSE = 1;
+	else	// otherwise consult extended leaf 0x80000001 (EDX bit 22) when that leaf exists
+	{
+		uint32 extLeaf[4];
+		CpuId(0x80000000, extLeaf);
+		if (extLeaf[0] >= 0x80000001)
+		{
+			CpuId(0x80000001, extLeaf);
+			g_hasISSE = (extLeaf[3] & (1 << 22)) != 0;
+		}
+	}
+
+	if (IsIntel(vendorLeaf))
+	{
+		g_isP4 = ((featureLeaf[0] >> 8) & 0xf) == 0xf;
+		g_cacheLineSize = 8 * GETBYTE(featureLeaf[1], 1);
+	}
+	else if (IsAMD(vendorLeaf))
+	{
+		CpuId(0x80000005, vendorLeaf);	// the vendor words are no longer needed, so the buffer is reused
+		g_cacheLineSize = GETBYTE(vendorLeaf[2], 0);
+	}
+
+	if (!g_cacheLineSize)	// a reported size of 0 falls back to the compile-time default
+		g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
+	*((volatile int*)&g_x86DetectionDone) = 1;
+}
+
+#endif
diff --git a/src/Crypto/cpu.h b/src/Crypto/cpu.h
index da8d14cb..7ef509ec 100644
--- a/src/Crypto/cpu.h
+++ b/src/Crypto/cpu.h
@@ -1,308 +1,308 @@
-#ifndef CRYPTOPP_CPU_H
-#define CRYPTOPP_CPU_H
-
-#include "Common/Tcdefs.h"
-#include "config.h"
-
-#ifdef CRYPTOPP_GENERATE_X64_MASM
-
-#define CRYPTOPP_X86_ASM_AVAILABLE
-#define CRYPTOPP_BOOL_X64 1
-#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1
-
-#else
-
-#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
-#include <emmintrin.h>
-#endif
-
-#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
-#if defined(__SSSE3__) || defined(__INTEL_COMPILER)
-#ifdef TC_WINDOWS_DRIVER
-extern __m128i _mm_shuffle_epi8 (__m128i a, __m128i b);
-#else
-#include <tmmintrin.h>
-#endif
-#endif
-
-#if defined(__SSE4_1__) || defined(__INTEL_COMPILER)
-#ifdef TC_WINDOWS_DRIVER
-extern int   _mm_extract_epi32(__m128i src, const int ndx);
-extern __m128i _mm_insert_epi32(__m128i dst, int s, const int ndx);
-#else
-#include <smmintrin.h>
-#endif
-#endif
-
-#if (defined(__AES__) && defined(__PCLMUL__)) || defined(__INTEL_COMPILER)
-#ifdef TC_WINDOWS_DRIVER
-extern __m128i _mm_clmulepi64_si128(__m128i v1, __m128i v2, 
-					    const int imm8);
-extern __m128i _mm_aeskeygenassist_si128(__m128i ckey, const int rcon);
-extern __m128i _mm_aesimc_si128(__m128i v);
-extern __m128i _mm_aesenc_si128(__m128i v, __m128i rkey);
-extern __m128i _mm_aesenclast_si128(__m128i v, __m128i rkey);
-extern __m128i _mm_aesdec_si128(__m128i v, __m128i rkey);
-extern __m128i _mm_aesdeclast_si128(__m128i v, __m128i rkey);
-#else
-#include <wmmintrin.h>
-#endif
-#endif
-#endif
-
-#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
-
-#define CRYPTOPP_CPUID_AVAILABLE
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-// these should not be used directly
-extern int g_x86DetectionDone;
-extern int g_hasSSSE3;
-extern int g_hasAESNI;
-extern int g_hasCLMUL;
-extern int g_isP4;
-extern uint32 g_cacheLineSize;
-void DetectX86Features(); // must be called at the start of the program/driver
-int CpuId(uint32 input, uint32 *output);
-
-#if CRYPTOPP_BOOL_X64
-#define HasSSE2()	1
-#define HasISSE()	1
-#define HasMMX()	1
-#else
-
-extern int g_hasSSE2;
-extern int g_hasISSE;
-extern int g_hasMMX;
-
-#define HasSSE2()	g_hasSSE2
-#define HasISSE()	g_hasISSE
-#define HasMMX()	g_hasMMX
-
-#endif
-
-#define HasSSSE3() g_hasSSSE3
-#define HasAESNI() g_hasAESNI
-#define HasCLMUL() g_hasCLMUL
-#define IsP4() g_isP4
-#define GetCacheLineSize() g_cacheLineSize
-
-#if defined(__cplusplus)
-}
-#endif
-
-#else
-
-#define GetCacheLineSize()	CRYPTOPP_L1_CACHE_LINE_SIZE
-
-#endif
-
-#endif
-
-#ifdef CRYPTOPP_GENERATE_X64_MASM
-	#define AS1(x) x*newline*
-	#define AS2(x, y) x, y*newline*
-	#define AS3(x, y, z) x, y, z*newline*
-	#define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline*
-	#define ASL(x) label##x:*newline*
-	#define ASJ(x, y, z) x label##y*newline*
-	#define ASC(x, y) x label##y*newline*
-	#define AS_HEX(y) 0##y##h
-#elif defined(_MSC_VER) || defined(__BORLANDC__)
-	#define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
-	#define AS1(x) __asm {x}
-	#define AS2(x, y) __asm {x, y}
-	#define AS3(x, y, z) __asm {x, y, z}
-	#define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)}
-	#define ASL(x) __asm {label##x:}
-	#define ASJ(x, y, z) __asm {x label##y}
-	#define ASC(x, y) __asm {x label##y}
-	#define CRYPTOPP_NAKED __declspec(naked)
-	#define AS_HEX(y) 0x##y
-#else
-	#define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
-
-    #if defined(CRYPTOPP_CLANG_VERSION) || defined(CRYPTOPP_APPLE_CLANG_VERSION)
-        #define NEW_LINE "\n"
-        #define INTEL_PREFIX ".intel_syntax;"
-        #define INTEL_NOPREFIX ".intel_syntax;"
-        #define ATT_PREFIX ".att_syntax;"
-        #define ATT_NOPREFIX ".att_syntax;"
-    #else
-        #define NEW_LINE
-        #define INTEL_PREFIX ".intel_syntax prefix;"
-        #define INTEL_NOPREFIX ".intel_syntax noprefix;"
-        #define ATT_PREFIX ".att_syntax prefix;"
-        #define ATT_NOPREFIX ".att_syntax noprefix;"
-        #endif
-
-    // define these in two steps to allow arguments to be expanded
-    #define GNU_AS1(x) #x ";" NEW_LINE
-    #define GNU_AS2(x, y) #x ", " #y ";" NEW_LINE
-    #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";" NEW_LINE
-    #define GNU_ASL(x) "\n" #x ":" NEW_LINE
-    #define GNU_ASJ(x, y, z) #x " " #y #z ";" NEW_LINE
-    #define AS1(x) GNU_AS1(x)
-    #define AS2(x, y) GNU_AS2(x, y)
-    #define AS3(x, y, z) GNU_AS3(x, y, z)
-    #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
-    #define ASL(x) GNU_ASL(x)
-    #define ASJ(x, y, z) GNU_ASJ(x, y, z)
-    #define ASC(x, y) #x " " #y ";"
-    #define CRYPTOPP_NAKED
-    #define AS_HEX(y) 0x##y
-#endif
-
-#define IF0(y)
-#define IF1(y) y
-
-// Should be confined to GCC, but its used to help manage Clang 3.4 compiler error.
-//   Also see LLVM Bug 24232, http://llvm.org/bugs/show_bug.cgi?id=24232 .
-#ifndef INTEL_PREFIX
-#define INTEL_PREFIX
-#endif
-#ifndef INTEL_NOPREFIX
-#define INTEL_NOPREFIX
-#endif
-#ifndef ATT_PREFIX
-#define ATT_PREFIX
-#endif
-#ifndef ATT_NOPREFIX
-#define ATT_NOPREFIX
-#endif
-
-#ifdef CRYPTOPP_GENERATE_X64_MASM
-#define ASM_MOD(x, y) ((x) MOD (y))
-#define XMMWORD_PTR XMMWORD PTR
-#else
-// GNU assembler doesn't seem to have mod operator
-#define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
-// GAS 2.15 doesn't support XMMWORD PTR. it seems necessary only for MASM
-#define XMMWORD_PTR
-#endif
-
-#if CRYPTOPP_BOOL_X86
-	#define AS_REG_1 ecx
-	#define AS_REG_2 edx
-	#define AS_REG_3 esi
-	#define AS_REG_4 edi
-	#define AS_REG_5 eax
-	#define AS_REG_6 ebx
-	#define AS_REG_7 ebp
-	#define AS_REG_1d ecx
-	#define AS_REG_2d edx
-	#define AS_REG_3d esi
-	#define AS_REG_4d edi
-	#define AS_REG_5d eax
-	#define AS_REG_6d ebx
-	#define AS_REG_7d ebp
-	#define WORD_SZ 4
-	#define WORD_REG(x)	e##x
-	#define WORD_PTR DWORD PTR
-	#define AS_PUSH_IF86(x) AS1(push e##x)
-	#define AS_POP_IF86(x) AS1(pop e##x)
-	#define AS_JCXZ jecxz
-#elif CRYPTOPP_BOOL_X32
-    #define AS_REG_1 ecx
-    #define AS_REG_2 edx
-    #define AS_REG_3 r8d
-    #define AS_REG_4 r9d
-    #define AS_REG_5 eax
-    #define AS_REG_6 r10d
-    #define AS_REG_7 r11d
-    #define AS_REG_1d ecx
-    #define AS_REG_2d edx
-    #define AS_REG_3d r8d
-    #define AS_REG_4d r9d
-    #define AS_REG_5d eax
-    #define AS_REG_6d r10d
-    #define AS_REG_7d r11d
-    #define WORD_SZ 4
-    #define WORD_REG(x)	e##x
-    #define WORD_PTR DWORD PTR
-    #define AS_PUSH_IF86(x) AS1(push r##x)
-    #define AS_POP_IF86(x) AS1(pop r##x)
-    #define AS_JCXZ jecxz
-#elif CRYPTOPP_BOOL_X64
-	#ifdef CRYPTOPP_GENERATE_X64_MASM
-		#define AS_REG_1 rcx
-		#define AS_REG_2 rdx
-		#define AS_REG_3 r8
-		#define AS_REG_4 r9
-		#define AS_REG_5 rax
-		#define AS_REG_6 r10
-		#define AS_REG_7 r11
-		#define AS_REG_1d ecx
-		#define AS_REG_2d edx
-		#define AS_REG_3d r8d
-		#define AS_REG_4d r9d
-		#define AS_REG_5d eax
-		#define AS_REG_6d r10d
-		#define AS_REG_7d r11d
-	#else
-		#define AS_REG_1 rdi
-		#define AS_REG_2 rsi
-		#define AS_REG_3 rdx
-		#define AS_REG_4 rcx
-		#define AS_REG_5 r8
-		#define AS_REG_6 r9
-		#define AS_REG_7 r10
-		#define AS_REG_1d edi
-		#define AS_REG_2d esi
-		#define AS_REG_3d edx
-		#define AS_REG_4d ecx
-		#define AS_REG_5d r8d
-		#define AS_REG_6d r9d
-		#define AS_REG_7d r10d
-	#endif
-	#define WORD_SZ 8
-	#define WORD_REG(x)	r##x
-	#define WORD_PTR QWORD PTR
-	#define AS_PUSH_IF86(x)
-	#define AS_POP_IF86(x)
-	#define AS_JCXZ jrcxz
-#endif
-
-// helper macro for stream cipher output
-#define AS_XMM_OUTPUT4(labelPrefix, inputPtr, outputPtr, x0, x1, x2, x3, t, p0, p1, p2, p3, increment)\
-	AS2(	test	inputPtr, inputPtr)\
-	ASC(	jz,		labelPrefix##3)\
-	AS2(	test	inputPtr, 15)\
-	ASC(	jnz,	labelPrefix##7)\
-	AS2(	pxor	xmm##x0, [inputPtr+p0*16])\
-	AS2(	pxor	xmm##x1, [inputPtr+p1*16])\
-	AS2(	pxor	xmm##x2, [inputPtr+p2*16])\
-	AS2(	pxor	xmm##x3, [inputPtr+p3*16])\
-	AS2(	add		inputPtr, increment*16)\
-	ASC(	jmp,	labelPrefix##3)\
-	ASL(labelPrefix##7)\
-	AS2(	movdqu	xmm##t, [inputPtr+p0*16])\
-	AS2(	pxor	xmm##x0, xmm##t)\
-	AS2(	movdqu	xmm##t, [inputPtr+p1*16])\
-	AS2(	pxor	xmm##x1, xmm##t)\
-	AS2(	movdqu	xmm##t, [inputPtr+p2*16])\
-	AS2(	pxor	xmm##x2, xmm##t)\
-	AS2(	movdqu	xmm##t, [inputPtr+p3*16])\
-	AS2(	pxor	xmm##x3, xmm##t)\
-	AS2(	add		inputPtr, increment*16)\
-	ASL(labelPrefix##3)\
-	AS2(	test	outputPtr, 15)\
-	ASC(	jnz,	labelPrefix##8)\
-	AS2(	movdqa	[outputPtr+p0*16], xmm##x0)\
-	AS2(	movdqa	[outputPtr+p1*16], xmm##x1)\
-	AS2(	movdqa	[outputPtr+p2*16], xmm##x2)\
-	AS2(	movdqa	[outputPtr+p3*16], xmm##x3)\
-	ASC(	jmp,	labelPrefix##9)\
-	ASL(labelPrefix##8)\
-	AS2(	movdqu	[outputPtr+p0*16], xmm##x0)\
-	AS2(	movdqu	[outputPtr+p1*16], xmm##x1)\
-	AS2(	movdqu	[outputPtr+p2*16], xmm##x2)\
-	AS2(	movdqu	[outputPtr+p3*16], xmm##x3)\
-	ASL(labelPrefix##9)\
-	AS2(	add		outputPtr, increment*16)
-
-
-#endif
+#ifndef CRYPTOPP_CPU_H
+#define CRYPTOPP_CPU_H
+
+#include "Common/Tcdefs.h"
+#include "config.h"
+
+#ifdef CRYPTOPP_GENERATE_X64_MASM
+
+#define CRYPTOPP_X86_ASM_AVAILABLE
+#define CRYPTOPP_BOOL_X64 1
+#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1
+
+#else
+
+#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
+#include <emmintrin.h>
+#endif
+
+#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
+#if defined(__SSSE3__) || defined(__INTEL_COMPILER)
+#ifdef TC_WINDOWS_DRIVER
+extern __m128i _mm_shuffle_epi8 (__m128i a, __m128i b);
+#else
+#include <tmmintrin.h>
+#endif
+#endif
+
+#if defined(__SSE4_1__) || defined(__INTEL_COMPILER)
+#ifdef TC_WINDOWS_DRIVER
+extern int   _mm_extract_epi32(__m128i src, const int ndx);
+extern __m128i _mm_insert_epi32(__m128i dst, int s, const int ndx);
+#else
+#include <smmintrin.h>
+#endif
+#endif
+
+#if (defined(__AES__) && defined(__PCLMUL__)) || defined(__INTEL_COMPILER)
+#ifdef TC_WINDOWS_DRIVER
+extern __m128i _mm_clmulepi64_si128(__m128i v1, __m128i v2, 
+					    const int imm8);
+extern __m128i _mm_aeskeygenassist_si128(__m128i ckey, const int rcon);
+extern __m128i _mm_aesimc_si128(__m128i v);
+extern __m128i _mm_aesenc_si128(__m128i v, __m128i rkey);
+extern __m128i _mm_aesenclast_si128(__m128i v, __m128i rkey);
+extern __m128i _mm_aesdec_si128(__m128i v, __m128i rkey);
+extern __m128i _mm_aesdeclast_si128(__m128i v, __m128i rkey);
+#else
+#include <wmmintrin.h>
+#endif
+#endif
+#endif
+
+#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
+
+#define CRYPTOPP_CPUID_AVAILABLE
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// these should not be used directly; read them through the Has*() / IsP4() / GetCacheLineSize() macros below
+extern int g_x86DetectionDone;
+extern int g_hasSSSE3;
+extern int g_hasAESNI;
+extern int g_hasCLMUL;
+extern int g_isP4;
+extern uint32 g_cacheLineSize;
+void DetectX86Features(); // must be called at the start of the program/driver
+int CpuId(uint32 input, uint32 *output);	// output receives EAX, EBX, ECX, EDX (four words); returns 0 on failure, 1 otherwise
+// On x64 builds the three baseline features are compile-time constants; otherwise they come from detected globals
+#if CRYPTOPP_BOOL_X64
+#define HasSSE2()	1
+#define HasISSE()	1
+#define HasMMX()	1
+#else
+
+extern int g_hasSSE2;
+extern int g_hasISSE;
+extern int g_hasMMX;
+
+#define HasSSE2()	g_hasSSE2
+#define HasISSE()	g_hasISSE
+#define HasMMX()	g_hasMMX
+
+#endif
+// accessors for the remaining flags; the globals keep their initial values until DetectX86Features() has run
+#define HasSSSE3() g_hasSSSE3
+#define HasAESNI() g_hasAESNI
+#define HasCLMUL() g_hasCLMUL
+#define IsP4() g_isP4
+#define GetCacheLineSize() g_cacheLineSize
+
+#if defined(__cplusplus)
+}
+#endif
+
+#else
+
+#define GetCacheLineSize()	CRYPTOPP_L1_CACHE_LINE_SIZE
+
+#endif
+
+#endif
+
+#ifdef CRYPTOPP_GENERATE_X64_MASM
+	#define AS1(x) x*newline*
+	#define AS2(x, y) x, y*newline*
+	#define AS3(x, y, z) x, y, z*newline*
+	#define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline*
+	#define ASL(x) label##x:*newline*
+	#define ASJ(x, y, z) x label##y*newline*
+	#define ASC(x, y) x label##y*newline*
+	#define AS_HEX(y) 0##y##h
+#elif defined(_MSC_VER) || defined(__BORLANDC__)
+	#define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
+	#define AS1(x) __asm {x}
+	#define AS2(x, y) __asm {x, y}
+	#define AS3(x, y, z) __asm {x, y, z}
+	#define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)}
+	#define ASL(x) __asm {label##x:}
+	#define ASJ(x, y, z) __asm {x label##y}
+	#define ASC(x, y) __asm {x label##y}
+	#define CRYPTOPP_NAKED __declspec(naked)
+	#define AS_HEX(y) 0x##y
+#else
+	#define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
+	// In this branch each ASn() stringifies its operands, so consecutive macros concatenate into one asm string
+    #if defined(CRYPTOPP_CLANG_VERSION) || defined(CRYPTOPP_APPLE_CLANG_VERSION)
+        #define NEW_LINE "\n"
+        #define INTEL_PREFIX ".intel_syntax;"
+        #define INTEL_NOPREFIX ".intel_syntax;"
+        #define ATT_PREFIX ".att_syntax;"
+        #define ATT_NOPREFIX ".att_syntax;"
+    #else
+        #define NEW_LINE
+        #define INTEL_PREFIX ".intel_syntax prefix;"
+        #define INTEL_NOPREFIX ".intel_syntax noprefix;"
+        #define ATT_PREFIX ".att_syntax prefix;"
+        #define ATT_NOPREFIX ".att_syntax noprefix;"
+        #endif
+    // NEW_LINE expands to "\n" only in the Clang branch above and to nothing otherwise
+    // define these in two steps to allow arguments to be expanded
+    #define GNU_AS1(x) #x ";" NEW_LINE
+    #define GNU_AS2(x, y) #x ", " #y ";" NEW_LINE
+    #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";" NEW_LINE
+    #define GNU_ASL(x) "\n" #x ":" NEW_LINE
+    #define GNU_ASJ(x, y, z) #x " " #y #z ";" NEW_LINE
+    #define AS1(x) GNU_AS1(x)
+    #define AS2(x, y) GNU_AS2(x, y)
+    #define AS3(x, y, z) GNU_AS3(x, y, z)
+    #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
+    #define ASL(x) GNU_ASL(x)
+    #define ASJ(x, y, z) GNU_ASJ(x, y, z)
+    #define ASC(x, y) #x " " #y ";"
+    #define CRYPTOPP_NAKED
+    #define AS_HEX(y) 0x##y
+#endif
+
+#define IF0(y)
+#define IF1(y) y
+
+// Should be confined to GCC, but it's used to help manage a Clang 3.4 compiler error.
+//   Also see LLVM Bug 24232, http://llvm.org/bugs/show_bug.cgi?id=24232 .
+#ifndef INTEL_PREFIX
+#define INTEL_PREFIX
+#endif
+#ifndef INTEL_NOPREFIX
+#define INTEL_NOPREFIX
+#endif
+#ifndef ATT_PREFIX
+#define ATT_PREFIX
+#endif
+#ifndef ATT_NOPREFIX
+#define ATT_NOPREFIX
+#endif
+
+#ifdef CRYPTOPP_GENERATE_X64_MASM
+#define ASM_MOD(x, y) ((x) MOD (y))
+#define XMMWORD_PTR XMMWORD PTR
+#else
+// The GNU assembler doesn't seem to have a mod operator, so compute x mod y as x - (x/y)*y
+#define ASM_MOD(x, y) ((x)-((x)/(y))*(y))
+// GAS 2.15 doesn't support XMMWORD PTR. It seems to be necessary only for MASM, so it expands to nothing here.
+#define XMMWORD_PTR
+#endif
+
+#if CRYPTOPP_BOOL_X86
+	#define AS_REG_1 ecx
+	#define AS_REG_2 edx
+	#define AS_REG_3 esi
+	#define AS_REG_4 edi
+	#define AS_REG_5 eax
+	#define AS_REG_6 ebx
+	#define AS_REG_7 ebp
+	#define AS_REG_1d ecx
+	#define AS_REG_2d edx
+	#define AS_REG_3d esi
+	#define AS_REG_4d edi
+	#define AS_REG_5d eax
+	#define AS_REG_6d ebx
+	#define AS_REG_7d ebp
+	#define WORD_SZ 4
+	#define WORD_REG(x)	e##x
+	#define WORD_PTR DWORD PTR
+	#define AS_PUSH_IF86(x) AS1(push e##x)
+	#define AS_POP_IF86(x) AS1(pop e##x)
+	#define AS_JCXZ jecxz
+#elif CRYPTOPP_BOOL_X32
+    #define AS_REG_1 ecx
+    #define AS_REG_2 edx
+    #define AS_REG_3 r8d
+    #define AS_REG_4 r9d
+    #define AS_REG_5 eax
+    #define AS_REG_6 r10d
+    #define AS_REG_7 r11d
+    #define AS_REG_1d ecx
+    #define AS_REG_2d edx
+    #define AS_REG_3d r8d
+    #define AS_REG_4d r9d
+    #define AS_REG_5d eax
+    #define AS_REG_6d r10d
+    #define AS_REG_7d r11d
+    #define WORD_SZ 4
+    #define WORD_REG(x)	e##x
+    #define WORD_PTR DWORD PTR
+    #define AS_PUSH_IF86(x) AS1(push r##x)
+    #define AS_POP_IF86(x) AS1(pop r##x)
+    #define AS_JCXZ jecxz
+#elif CRYPTOPP_BOOL_X64
+	#ifdef CRYPTOPP_GENERATE_X64_MASM
+		#define AS_REG_1 rcx
+		#define AS_REG_2 rdx
+		#define AS_REG_3 r8
+		#define AS_REG_4 r9
+		#define AS_REG_5 rax
+		#define AS_REG_6 r10
+		#define AS_REG_7 r11
+		#define AS_REG_1d ecx
+		#define AS_REG_2d edx
+		#define AS_REG_3d r8d
+		#define AS_REG_4d r9d
+		#define AS_REG_5d eax
+		#define AS_REG_6d r10d
+		#define AS_REG_7d r11d
+	#else
+		#define AS_REG_1 rdi
+		#define AS_REG_2 rsi
+		#define AS_REG_3 rdx
+		#define AS_REG_4 rcx
+		#define AS_REG_5 r8
+		#define AS_REG_6 r9
+		#define AS_REG_7 r10
+		#define AS_REG_1d edi
+		#define AS_REG_2d esi
+		#define AS_REG_3d edx
+		#define AS_REG_4d ecx
+		#define AS_REG_5d r8d
+		#define AS_REG_6d r9d
+		#define AS_REG_7d r10d
+	#endif
+	#define WORD_SZ 8
+	#define WORD_REG(x)	r##x
+	#define WORD_PTR QWORD PTR
+	#define AS_PUSH_IF86(x)
+	#define AS_POP_IF86(x)
+	#define AS_JCXZ jrcxz
+#endif
+
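+// helper macro for stream cipher output: if inputPtr is non-zero, XORs xmm registers x0..x3 with the 16-byte blocks at inputPtr+p0..p3*16 and advances inputPtr by increment*16; then stores the four registers to outputPtr+p0..p3*16 and advances outputPtr by increment*16. A pointer with any of its low 4 bits set takes the movdqu (unaligned) path; t is a scratch xmm register used only on the unaligned-input path. Emits labels labelPrefix##3, 7, 8 and 9.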
+#define AS_XMM_OUTPUT4(labelPrefix, inputPtr, outputPtr, x0, x1, x2, x3, t, p0, p1, p2, p3, increment)\
+	AS2(	test	inputPtr, inputPtr)\
+	ASC(	jz,		labelPrefix##3)\
+	AS2(	test	inputPtr, 15)\
+	ASC(	jnz,	labelPrefix##7)\
+	AS2(	pxor	xmm##x0, [inputPtr+p0*16])\
+	AS2(	pxor	xmm##x1, [inputPtr+p1*16])\
+	AS2(	pxor	xmm##x2, [inputPtr+p2*16])\
+	AS2(	pxor	xmm##x3, [inputPtr+p3*16])\
+	AS2(	add		inputPtr, increment*16)\
+	ASC(	jmp,	labelPrefix##3)\
+	ASL(labelPrefix##7)\
+	AS2(	movdqu	xmm##t, [inputPtr+p0*16])\
+	AS2(	pxor	xmm##x0, xmm##t)\
+	AS2(	movdqu	xmm##t, [inputPtr+p1*16])\
+	AS2(	pxor	xmm##x1, xmm##t)\
+	AS2(	movdqu	xmm##t, [inputPtr+p2*16])\
+	AS2(	pxor	xmm##x2, xmm##t)\
+	AS2(	movdqu	xmm##t, [inputPtr+p3*16])\
+	AS2(	pxor	xmm##x3, xmm##t)\
+	AS2(	add		inputPtr, increment*16)\
+	ASL(labelPrefix##3)\
+	AS2(	test	outputPtr, 15)\
+	ASC(	jnz,	labelPrefix##8)\
+	AS2(	movdqa	[outputPtr+p0*16], xmm##x0)\
+	AS2(	movdqa	[outputPtr+p1*16], xmm##x1)\
+	AS2(	movdqa	[outputPtr+p2*16], xmm##x2)\
+	AS2(	movdqa	[outputPtr+p3*16], xmm##x3)\
+	ASC(	jmp,	labelPrefix##9)\
+	ASL(labelPrefix##8)\
+	AS2(	movdqu	[outputPtr+p0*16], xmm##x0)\
+	AS2(	movdqu	[outputPtr+p1*16], xmm##x1)\
+	AS2(	movdqu	[outputPtr+p2*16], xmm##x2)\
+	AS2(	movdqu	[outputPtr+p3*16], xmm##x3)\
+	ASL(labelPrefix##9)\
+	AS2(	add		outputPtr, increment*16)
+
+
+#endif