diff options
Diffstat (limited to 'src/Crypto/AesSmall_x86.asm')
-rw-r--r-- | src/Crypto/AesSmall_x86.asm | 2888 |
1 files changed, 1444 insertions, 1444 deletions
diff --git a/src/Crypto/AesSmall_x86.asm b/src/Crypto/AesSmall_x86.asm index fe7dc47b..872aa013 100644 --- a/src/Crypto/AesSmall_x86.asm +++ b/src/Crypto/AesSmall_x86.asm @@ -1,1444 +1,1444 @@ -
-; ---------------------------------------------------------------------------
-; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
-;
-; LICENSE TERMS
-;
-; The free distribution and use of this software is allowed (with or without
-; changes) provided that:
-;
-; 1. source code distributions include the above copyright notice, this
-; list of conditions and the following disclaimer;
-;
-; 2. binary distributions include the above copyright notice, this list
-; of conditions and the following disclaimer in their documentation;
-;
-; 3. the name of the copyright holder is not used to endorse products
-; built using this software without specific written permission.
-;
-; DISCLAIMER
-;
-; This software is provided 'as is' with no explicit or implied warranties
-; in respect of its properties, including, but not limited to, correctness
-; and/or fitness for purpose.
-; ---------------------------------------------------------------------------
-; Issue 20/12/2007
-;
-; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h
-; and the same define to be set here as well. If ASM_X86_V2C is set this file
-; requires the C files aeskey.c and aestab.c for support.
-
-; An AES implementation for x86 processors using the YASM (or NASM) assembler.
-; This is a full assembler implementation covering encryption, decryption and
-; key scheduling. It uses 2k bytes of tables but its encryption and decryption
-; performance is very close to that obtained using large tables. Key schedule
-; expansion is slower for both encryption and decryption but this is likely to
-; be offset by the much smaller load that this version places on the processor
-; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of
-; the design of the AES round function used here.
-;
-; This code provides the standard AES block size (128 bits, 16 bytes) and the
-; three standard AES key sizes (128, 192 and 256 bits). It has the same call
-; interface as my C implementation. The ebx, esi, edi and ebp registers are
-; preserved across calls but eax, ecx and edx and the arithmetic status flags
-; are not. Although this is a full assembler implementation, it can be used
-; in conjunction with my C code which provides faster key scheduling using
-; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C
-; defined. It is also important that the defines below match those used in the
-; C code. This code uses the VC++ register saving conventions; if it is used
-; with another compiler, conventions for using and saving registers may need
-; to be checked (and calling conventions). The YASM command line for the VC++
-; custom build step is:
-;
-; yasm -Xvc -f win32 -D <Z> -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"
-;
-; For the cryptlib build this is (pcg):
-;
-; yasm -Xvc -f win32 -D ASM_X86_V2C -o aescrypt2.obj aes_x86_v2.asm
-;
-; where <Z> is ASM_X86_V2 or ASM_X86_V2C. The calling interfaces are:
-;
-; AES_RETURN aes_encrypt(const unsigned char in_blk[],
-; unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
-;
-; AES_RETURN aes_decrypt(const unsigned char in_blk[],
-; unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
-;
-; AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
-; const aes_encrypt_ctx cx[1]);
-;
-; AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
-; const aes_decrypt_ctx cx[1]);
-;
-; AES_RETURN aes_encrypt_key(const unsigned char key[],
-; unsigned int len, const aes_encrypt_ctx cx[1]);
-;
-; AES_RETURN aes_decrypt_key(const unsigned char key[],
-; unsigned int len, const aes_decrypt_ctx cx[1]);
-;
-; where <NNN> is 128, 192 or 256. In the last two calls the length can be in
-; either bits or bytes.
-
-; The DLL interface must use the _stdcall convention in which the number
-; of bytes of parameter space is added after an @ to the routine's name.
-; We must also remove our parameters from the stack before return (see
-; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
-
-;
-; Adapted for TrueCrypt:
-; - All tables generated at run-time
-; - Adapted for 16-bit environment
-;
-
-CPU 386 ; 32-bit registers and movzx are used throughout
-USE16 ; assemble for a 16-bit code segment
-SEGMENT _TEXT PUBLIC CLASS=CODE USE16
-SEGMENT _DATA PUBLIC CLASS=DATA USE16
-
-GROUP DGROUP _TEXT _DATA ; code and data are placed in one segment group
-
-extern _aes_dec_tab ; Aestab.c
-extern _aes_enc_tab ; presumably also defined in Aestab.c - TODO confirm
-
-; %define DLL_EXPORT
-
-; The size of the code can be reduced by using functions for the encryption
-; and decryption rounds in place of macro expansion
-
-%define REDUCE_CODE_SIZE ; mf_call then emits 'call' instead of expanding the round inline
-
-; Comment in/out the following lines to obtain the desired subroutines. These
-; selections MUST match those in the C header file aes.h
-
-; %define AES_128 ; define if AES with 128 bit keys is needed
-; %define AES_192 ; define if AES with 192 bit keys is needed
-%define AES_256 ; define if AES with 256 bit keys is needed
-; %define AES_VAR ; define if a variable key size is needed
-%define ENCRYPTION ; define if encryption is needed
-%define DECRYPTION ; define if decryption is needed
-; %define AES_REV_DKS ; define if key decryption schedule is reversed
-
-%ifndef ASM_X86_V2C ; with ASM_X86_V2C the key schedules are expected to come from the C code
-%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed
-%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed
-%endif
-
-; The encryption key schedule has the following in memory layout where N is the
-; number of rounds (10, 12 or 14):
-;
-; lo: | input key (round 0) | ; each round is four 32-bit words
-; | encryption round 1 |
-; | encryption round 2 |
-; ....
-; | encryption round N-1 |
-; hi: | encryption round N |
-;
-; The decryption key schedule is normally set up so that it has the same
-; layout as above by actually reversing the order of the encryption key
-; schedule in memory (this happens when AES_REV_DKS is set):
-;
-; lo: | decryption round 0 | = | encryption round N |
-; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
-; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
-; .... ....
-; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
-; hi: | decryption round N | = | input key (round 0) |
-;
-; with rounds except the first and last modified using inv_mix_column()
-; But if AES_REV_DKS is NOT set the order of keys is left as it is for
-; encryption so that it has to be accessed in reverse when used for
-; decryption (although the inverse mix column modifications are done)
-;
-; lo: | decryption round 0 | = | input key (round 0) |
-; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
-; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
-; .... ....
-; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
-; hi: | decryption round N | = | encryption round N |
-;
-; This layout is faster when the assembler key scheduling provided here
-; is used.
-;
-; End of user defines
-
-%ifdef AES_VAR ; a variable key size needs all three fixed-size routines
-%ifndef AES_128
-%define AES_128
-%endif
-%ifndef AES_192
-%define AES_192
-%endif
-%ifndef AES_256
-%define AES_256
-%endif
-%endif
-
-%ifdef AES_VAR ; KS_LENGTH = key schedule length in 32-bit words for the largest enabled key size
-%define KS_LENGTH 60 ; 15 round keys * 4 words
-%elifdef AES_256
-%define KS_LENGTH 60 ; 15 round keys * 4 words
-%elifdef AES_192
-%define KS_LENGTH 52 ; 13 round keys * 4 words
-%else
-%define KS_LENGTH 44 ; 11 round keys * 4 words
-%endif
-
-; These macros implement stack based local variables: slot N is the dword at [esp+4*N]
-
-%macro save 2 ; save <slot>,<src>
- mov [esp+4*%1],%2 ; store src into stack slot
-%endmacro
-
-%macro restore 2 ; restore <dst>,<slot>
- mov %1,[esp+4*%2] ; reload dst from stack slot
-%endmacro
-
-%ifdef REDUCE_CODE_SIZE ; mf_call <name>: invoke <name> either as a subroutine or as an inline macro
- %macro mf_call 1
- call %1 ; <name> is a label ending in ret
- %endmacro
-%else
- %macro mf_call 1
- %1 ; <name> is a zero-argument macro expanded in place
- %endmacro
-%endif
-
-; The DLL has to implement the _stdcall calling interface on return.
-; In this case we have to take our parameters (3 4-byte pointers)
-; off the stack, which is what the byte count given to these macros is for.
-
-%define parms 12 ; default parameter byte count (3 * 4) for the macros below
-
-%macro do_name 1-2 parms ; do_name <label>[,<bytes>]: declare a global entry point
-%ifndef DLL_EXPORT
- global %1
-%1:
-%else
- global %1@%2 ; decorated name: <label>@<parameter bytes>
- export %1@%2
-%1@%2:
-%endif
-%endmacro
-
-%macro do_call 1-2 parms ; do_call <label>[,<bytes>]: call a routine declared with do_name
-%ifndef DLL_EXPORT
- call %1
- add esp,%2 ; caller removes the <bytes> of parameters it pushed
-%else
- call %1@%2 ; callee removes its own parameters (see do_exit)
-%endif
-%endmacro
-
-%macro do_exit 0-1 parms ; do_exit [<bytes>]: return from a do_name routine
-%ifdef DLL_EXPORT
- ret %1 ; pop <bytes> of parameters on return
-%else
- ret ; parameters are left for the caller to remove
-%endif
-%endmacro
-
-; finite field multiplies by {02}, {04} and {08}; each bit shifted out past bit 7 xors in a shifted copy of 0x11b
-
-%define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
-%define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
-%define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
-
-; finite field multiplies required in table generation, built by xoring the products above
-
-%define f3(x) (f2(x) ^ x)
-%define f9(x) (f8(x) ^ x)
-%define fb(x) (f8(x) ^ f2(x) ^ x)
-%define fd(x) (f8(x) ^ f4(x) ^ x)
-%define fe(x) (f8(x) ^ f4(x) ^ f2(x))
-
-%define etab_0(x) [_aes_enc_tab+4+8*x] ; table entries are 8 bytes apart; x is a register holding the entry index
-%define etab_1(x) [_aes_enc_tab+3+8*x] ; etab_1..etab_3 read the same entry at successively lower byte offsets
-%define etab_2(x) [_aes_enc_tab+2+8*x]
-%define etab_3(x) [_aes_enc_tab+1+8*x]
-%define etab_b(x) byte [_aes_enc_tab+1+8*x] ; used with movzx for 0x000000xx
-%define etab_w(x) word [_aes_enc_tab+8*x] ; used with movzx for 0x0000xx00
-
-%define btab_0(x) [_aes_enc_tab+6+8*x] ; btab_n: used by the alternative last-round macros, masked to byte n there
-%define btab_1(x) [_aes_enc_tab+5+8*x]
-%define btab_2(x) [_aes_enc_tab+4+8*x]
-%define btab_3(x) [_aes_enc_tab+3+8*x]
-
-; ROUND FUNCTION. Build column[2] on ESI and column[3] on EDI that have the
-; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
-;
-; Input:
-;
-; EAX column[0]
-; EBX column[1]
-; ECX column[2]
-; EDX column[3]
-; ESI column key[round][2]
-; EDI column key[round][3]
-; EBP scratch
-;
-; Output:
-;
-; EBP column[0] unkeyed
-; EBX column[1] unkeyed
-; ESI column[2] keyed
-; EDI column[3] keyed
-; EAX scratch
-; ECX scratch
-; EDX scratch
-
-%macro rnd_fun 2 ; rnd_fun <xor macro>,<mov macro>: each takes (dest, index byte, table number, scratch)
-
- rol ebx,16 ; swap halves: the upper two bytes of column[1] become bl,bh
- %1 esi, cl, 0, ebp
- %1 esi, dh, 1, ebp
- %1 esi, bh, 3, ebp
- %1 edi, dl, 0, ebp
- %1 edi, ah, 1, ebp
- %1 edi, bl, 2, ebp
- %2 ebp, al, 0, ebp ; first term of column[0]: loaded with %2, not xored
- shr ebx,16 ; ebx = original lower half of column[1]
- and eax,0xffff0000 ; keep the upper half of column[0] ...
- or eax,ebx ; ... and pack the lower half of column[1] beneath it, freeing ebx
- shr edx,16 ; upper two bytes of column[3] become dl,dh
- %1 ebp, ah, 1, ebx
- %1 ebp, dh, 3, ebx
- %2 ebx, dl, 2, ebx ; first term of column[1]: loaded with %2, not xored
- %1 ebx, ch, 1, edx
- %1 ebx, al, 0, edx
- shr eax,16 ; upper two bytes of column[0] become al,ah
- shr ecx,16 ; upper two bytes of column[2] become cl,ch
- %1 ebp, cl, 2, edx
- %1 edi, ch, 3, edx
- %1 esi, al, 2, edx
- %1 ebx, ah, 3, edx
-
-%endmacro
-
-; Basic MOV and XOR Operations for normal rounds: <op> dest, index byte, table number, scratch
-
-%macro nr_xor 4
- movzx %4,%2 ; scratch = zero-extended index byte
- xor %1,etab_%3(%4) ; dest ^= dword from table view etab_<n>
-%endmacro
-
-%macro nr_mov 4
- movzx %4,%2 ; scratch = zero-extended index byte
- mov %1,etab_%3(%4) ; dest = dword from table view etab_<n>
-%endmacro
-
-; Basic MOV and XOR Operations for last round: <op> dest, index byte, byte position, scratch
-
-%if 1 ; selects the byte-lookup variant; change to 0 for the masked dword variant below
-
- %macro lr_xor 4
- movzx %4,%2 ; scratch = zero-extended index byte
- movzx %4,etab_b(%4) ; scratch = single table byte for that index
- %if %3 != 0
- shl %4,8*%3 ; move it to byte position <n>
- %endif
- xor %1,%4
- %endmacro
-
- %macro lr_mov 4
- movzx %4,%2 ; scratch = zero-extended index byte
- movzx %1,etab_b(%4) ; dest = single table byte for that index
- %if %3 != 0
- shl %1,8*%3 ; move it to byte position <n>
- %endif
- %endmacro
-
-%else ; less effective but worth leaving as an option
-
- %macro lr_xor 4
- movzx %4,%2
- mov %4,btab_%3(%4) ; dword view whose byte <n> is the wanted value
- and %4,0x000000ff << 8 * %3 ; keep only byte <n>
- xor %1,%4
- %endmacro
-
- %macro lr_mov 4
- movzx %4,%2
- mov %1,btab_%3(%4) ; dword view whose byte <n> is the wanted value
- and %1,0x000000ff << 8 * %3 ; keep only byte <n>
- %endmacro
-
-%endif
-
-; Apply the etab_b byte lookup to the 4 bytes of eax and xor the result into edx (any rotation is done by the caller)
-
-%ifdef REDUCE_CODE_SIZE
-
-l3s_col:
- movzx ecx,al ; in eax (word to substitute), edx (value to xor with)
- movzx ecx, etab_b(ecx) ; out eax = edx ^ substituted word
- xor edx,ecx ; clobbers ecx, edx
- movzx ecx,ah
- movzx ecx, etab_b(ecx)
- shl ecx,8 ; byte 1
- xor edx,ecx
- shr eax,16 ; bring the upper two bytes down to al,ah
- movzx ecx,al
- movzx ecx, etab_b(ecx)
- shl ecx,16 ; byte 2
- xor edx,ecx
- movzx ecx,ah
- movzx ecx, etab_b(ecx)
- shl ecx,24 ; byte 3
- xor edx,ecx
- mov eax,edx
- ret
-
-%else
-
-%macro l3s_col 0
-
- movzx ecx,al ; in eax (word to substitute), edx (value to xor with)
- movzx ecx, etab_b(ecx) ; out eax = edx ^ substituted word
- xor edx,ecx ; clobbers ecx, edx
- movzx ecx,ah
- movzx ecx, etab_b(ecx)
- shl ecx,8 ; byte 1
- xor edx,ecx
- shr eax,16 ; bring the upper two bytes down to al,ah
- movzx ecx,al
- movzx ecx, etab_b(ecx)
- shl ecx,16 ; byte 2
- xor edx,ecx
- movzx ecx,ah
- movzx ecx, etab_b(ecx)
- shl ecx,24 ; byte 3
- xor edx,ecx
- mov eax,edx
-
-%endmacro
-
-%endif
-
-; offsets to parameters relative to the stack pointer at entry; each parameter is read as a 16-bit word
-
-in_blk equ 2 ; input byte array address parameter
-out_blk equ 4 ; output byte array address parameter
-ctx equ 6 ; AES context structure
-stk_spc equ 20 ; stack space: four saved registers plus one local slot
-
-%ifdef ENCRYPTION
-
-; %define ENCRYPTION_TABLE
-
-%ifdef REDUCE_CODE_SIZE ; enc_round: one normal encryption round on eax,ebx,ecx,edx with ebp = round key pointer
-
-enc_round:
- sub sp, 2 ; presumably pads the 2-byte return address to 4 so that slot 1 here is the caller's slot 0
- add ebp,16 ; advance to the next round key
- save 1,ebp ; rnd_fun reuses ebp as a column register
- mov esi,[ebp+8] ; preload round key words 2 and 3
- mov edi,[ebp+12]
-
- rnd_fun nr_xor, nr_mov
-
- mov eax,ebp ; move the new columns back into eax,ebx,ecx,edx
- mov ecx,esi
- mov edx,edi
- restore ebp,1
- xor eax,[ebp] ; round key words 0 and 1 are applied after the round function
- xor ebx,[ebp+4]
- add sp, 2 ; undo the padding before returning
- ret
-
-%else
-
-%macro enc_round 0
-
- add ebp,16 ; advance to the next round key
- save 0,ebp ; rnd_fun reuses ebp as a column register
- mov esi,[ebp+8] ; preload round key words 2 and 3
- mov edi,[ebp+12]
-
- rnd_fun nr_xor, nr_mov
-
- mov eax,ebp ; move the new columns back into eax,ebx,ecx,edx
- mov ecx,esi
- mov edx,edi
- restore ebp,0
- xor eax,[ebp] ; round key words 0 and 1 are applied after the round function
- xor ebx,[ebp+4]
-
-%endmacro
-
-%endif
-
-%macro enc_last_round 0 ; final encryption round; result is left in eax,ebx,esi,edi
-
- add ebp,16 ; advance to the last round key
- save 0,ebp ; rnd_fun reuses ebp as a column register
- mov esi,[ebp+8] ; preload round key words 2 and 3
- mov edi,[ebp+12]
-
- rnd_fun lr_xor, lr_mov
-
- mov eax,ebp ; column[0]; columns 2 and 3 stay in esi and edi
- restore ebp,0
- xor eax,[ebp] ; round key words 0 and 1 are applied after the round function
- xor ebx,[ebp+4]
-
-%endmacro
-
- section _TEXT
-
-; AES Encryption Subroutine: word-sized parameters in_blk, out_blk, ctx; eax is 0 on success, -1 on a bad round count
-
- do_name _aes_encrypt,12
-
- mov ax, sp
- movzx esp, ax ; clear the upper half of esp so [esp+...] addressing is usable
-
- sub esp,stk_spc
- mov [esp+16],ebp ; save the registers that are restored at .5
- mov [esp+12],ebx
- mov [esp+ 8],esi
- mov [esp+ 4],edi
-
- movzx esi,word [esp+in_blk+stk_spc] ; input pointer
- mov eax,[esi ] ; load the four input columns
- mov ebx,[esi+ 4]
- mov ecx,[esi+ 8]
- mov edx,[esi+12]
-
- movzx ebp,word [esp+ctx+stk_spc] ; key pointer
- movzx edi,byte [ebp+4*KS_LENGTH] ; value stored after the key schedule (16 * number of rounds)
- xor eax,[ebp ] ; xor in the first round key
- xor ebx,[ebp+ 4]
- xor ecx,[ebp+ 8]
- xor edx,[ebp+12]
-
-; determine the number of rounds (skipped when AES_256 is defined: 14 rounds are always run)
-
-%ifndef AES_256
- cmp edi,10*16
- je .3
- cmp edi,12*16
- je .2
- cmp edi,14*16
- je .1
- mov eax,-1 ; unrecognised round count
- jmp .5
-%endif
-
-.1: mf_call enc_round ; 14-round entry
- mf_call enc_round
-.2: mf_call enc_round ; 12-round entry
- mf_call enc_round
-.3: mf_call enc_round ; 10-round entry
- mf_call enc_round
- mf_call enc_round
- mf_call enc_round
- mf_call enc_round
- mf_call enc_round
- mf_call enc_round
- mf_call enc_round
- mf_call enc_round
- enc_last_round
-
- movzx edx,word [esp+out_blk+stk_spc] ; output pointer
- mov [edx],eax
- mov [edx+4],ebx
- mov [edx+8],esi
- mov [edx+12],edi
- xor eax,eax ; return 0
-
-.5: mov ebp,[esp+16] ; common exit: restore saved registers
- mov ebx,[esp+12]
- mov esi,[esp+ 8]
- mov edi,[esp+ 4]
- add esp,stk_spc
- do_exit 12
-
-%endif
-
-%macro f_key 2 ; f_key <cycle index>,<bytes per cycle: 16, 24 or 32>; expects esi,edi,ecx,edx = first four words of the previous cycle, eax = its last word
-
- push ecx
- push edx
- mov edx,esi ; l3s_col xors its result into edx
- ror eax,8 ; rotate the last word by one byte before the byte lookup
- mf_call l3s_col
- mov esi,eax
- pop edx
- pop ecx
- xor esi,rc_val ; xor in the current round constant
-
- mov [ebp+%1*%2],esi ; words 0..3 of this cycle: each is the previous word xor the word one cycle back
- xor edi,esi
- mov [ebp+%1*%2+4],edi
- xor ecx,edi
- mov [ebp+%1*%2+8],ecx
- xor edx,ecx
- mov [ebp+%1*%2+12],edx
- mov eax,edx
-
-%if %2 == 24
-
-%if %1 < 7 ; the final cycle only needs its first four words
- xor eax,[ebp+%1*%2+16-%2]
- mov [ebp+%1*%2+16],eax
- xor eax,[ebp+%1*%2+20-%2]
- mov [ebp+%1*%2+20],eax
-%endif
-
-%elif %2 == 32
-
-%if %1 < 6 ; the final cycle only needs its first four words
- push ecx
- push edx
- mov edx,[ebp+%1*%2+16-%2] ; word 4 of the previous cycle
- mf_call l3s_col ; extra byte-lookup step, this time without rotation
- pop edx
- pop ecx
- mov [ebp+%1*%2+16],eax
- xor eax,[ebp+%1*%2+20-%2]
- mov [ebp+%1*%2+20],eax
- xor eax,[ebp+%1*%2+24-%2]
- mov [ebp+%1*%2+24],eax
- xor eax,[ebp+%1*%2+28-%2]
- mov [ebp+%1*%2+28],eax
-%endif
-
-%endif
-
-%assign rc_val f2(rc_val) ; next round constant = current one multiplied by {02}
-
-%endmacro
-
-%ifdef ENCRYPTION_KEY_SCHEDULE
-
-%ifdef AES_128
-
-%ifndef ENCRYPTION_TABLE
-; %define ENCRYPTION_TABLE
-%endif
-
-%assign rc_val 1 ; round constant for the first f_key expansion
-
- do_name _aes_encrypt_key128,8
-
- push ebp
- push ebx
- push esi
- push edi
-
- mov ebp,[esp+24] ; NOTE(review): dword parameter offsets, unlike the word offsets in _aes_encrypt_key256 - confirm before enabling AES_128 in the 16-bit build
- mov [ebp+4*KS_LENGTH],dword 10*16 ; record 16 * number of rounds after the schedule
- mov ebx,[esp+20]
-
- mov esi,[ebx] ; copy the 4 key words into the schedule, keeping them in esi,edi,ecx,edx
- mov [ebp],esi
- mov edi,[ebx+4]
- mov [ebp+4],edi
- mov ecx,[ebx+8]
- mov [ebp+8],ecx
- mov edx,[ebx+12]
- mov [ebp+12],edx
- add ebp,16 ; f_key offsets are relative to the end of the copied key
- mov eax,edx ; f_key expects the last word in eax
-
- f_key 0,16 ; 11 * 4 = 44 unsigned longs
- f_key 1,16 ; 4 + 4 * 10 generated = 44
- f_key 2,16
- f_key 3,16
- f_key 4,16
- f_key 5,16
- f_key 6,16
- f_key 7,16
- f_key 8,16
- f_key 9,16
-
- pop edi
- pop esi
- pop ebx
- pop ebp
- xor eax,eax ; return 0
- do_exit 8
-
-%endif
-
-%ifdef AES_192
-
-%ifndef ENCRYPTION_TABLE
-; %define ENCRYPTION_TABLE
-%endif
-
-%assign rc_val 1 ; round constant for the first f_key expansion
-
- do_name _aes_encrypt_key192,8
-
- push ebp
- push ebx
- push esi
- push edi
-
- mov ebp,[esp+24] ; NOTE(review): dword parameter offsets, unlike the word offsets in _aes_encrypt_key256 - confirm before enabling AES_192 in the 16-bit build
- mov [ebp+4*KS_LENGTH],dword 12 * 16 ; record 16 * number of rounds after the schedule
- mov ebx,[esp+20]
-
- mov esi,[ebx] ; copy the 6 key words into the schedule, keeping the first four in esi,edi,ecx,edx
- mov [ebp],esi
- mov edi,[ebx+4]
- mov [ebp+4],edi
- mov ecx,[ebx+8]
- mov [ebp+8],ecx
- mov edx,[ebx+12]
- mov [ebp+12],edx
- mov eax,[ebx+16]
- mov [ebp+16],eax
- mov eax,[ebx+20] ; eax ends up holding the last key word, as f_key expects
- mov [ebp+20],eax
- add ebp,24 ; f_key offsets are relative to the end of the copied key
-
- f_key 0,24 ; 13 * 4 = 52 unsigned longs
- f_key 1,24 ; 6 + 6 * 8 generated = 54
- f_key 2,24
- f_key 3,24
- f_key 4,24
- f_key 5,24
- f_key 6,24
- f_key 7,24
-
- pop edi
- pop esi
- pop ebx
- pop ebp
- xor eax,eax ; return 0
- do_exit 8
-
-%endif
-
-%ifdef AES_256
-
-%ifndef ENCRYPTION_TABLE
-; %define ENCRYPTION_TABLE
-%endif
-
-%assign rc_val 1 ; round constant for the first f_key expansion
-
- do_name _aes_encrypt_key256,8
-
- mov ax, sp
- movzx esp, ax ; clear the upper half of esp so [esp+...] addressing is usable
-
- push ebp
- push ebx
- push esi
- push edi
-
- movzx ebp, word [esp+20] ; ks
- mov [ebp+4*KS_LENGTH],dword 14 * 16 ; record 16 * number of rounds after the schedule
- movzx ebx, word [esp+18] ; key
-
- mov esi,[ebx] ; copy the 8 key words into the schedule, keeping the first four in esi,edi,ecx,edx
- mov [ebp],esi
- mov edi,[ebx+4]
- mov [ebp+4],edi
- mov ecx,[ebx+8]
- mov [ebp+8],ecx
- mov edx,[ebx+12]
- mov [ebp+12],edx
- mov eax,[ebx+16]
- mov [ebp+16],eax
- mov eax,[ebx+20]
- mov [ebp+20],eax
- mov eax,[ebx+24]
- mov [ebp+24],eax
- mov eax,[ebx+28] ; eax ends up holding the last key word, as f_key expects
- mov [ebp+28],eax
- add ebp,32 ; f_key offsets are relative to the end of the copied key
-
- f_key 0,32 ; 15 * 4 = 60 unsigned longs
- f_key 1,32 ; 8 + 8 * 7 generated = 64
- f_key 2,32
- f_key 3,32
- f_key 4,32
- f_key 5,32
- f_key 6,32
-
- pop edi
- pop esi
- pop ebx
- pop ebp
- xor eax,eax ; return 0
- do_exit 8
-
-%endif
-
-%ifdef AES_VAR
-
-%ifndef ENCRYPTION_TABLE
-; %define ENCRYPTION_TABLE
-%endif
-
- do_name _aes_encrypt_key,12 ; dispatch on key length given in bytes or bits; eax = -1 for any other length
-
- mov ecx,[esp+4] ; NOTE(review): dword parameter offsets and dword pushes - confirm against the 16-bit key routines before enabling AES_VAR
- mov eax,[esp+8] ; key length
- mov edx,[esp+12]
- push edx ; re-push the context and key for the fixed-size routine
- push ecx
-
- cmp eax,16
- je .1
- cmp eax,128
- je .1
-
- cmp eax,24
- je .2
- cmp eax,192
- je .2
-
- cmp eax,32
- je .3
- cmp eax,256
- je .3
- mov eax,-1 ; unsupported key length
- add esp,8 ; discard the two values pushed above
- do_exit 12
-
-.1: do_call _aes_encrypt_key128,8
- do_exit 12
-.2: do_call _aes_encrypt_key192,8
- do_exit 12
-.3: do_call _aes_encrypt_key256,8
- do_exit 12
-
-%endif
-
-%endif
-
-%ifdef ENCRYPTION_TABLE
-
-; S-box data - 256 entries
-
- section _DATA
-
-%define u8(x) 0, x, x, f3(x), f2(x), x, x, f3(x) ; 8 bytes per value x, read at different byte offsets by etab_0..etab_3, etab_b, etab_w and btab_*
-
-_aes_enc_tab:
- db u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5)
- db u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76)
- db u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0)
- db u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0)
- db u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc)
- db u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15)
- db u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a)
- db u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75)
- db u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0)
- db u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84)
- db u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b)
- db u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf)
- db u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85)
- db u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8)
- db u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5)
- db u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2)
- db u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17)
- db u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73)
- db u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88)
- db u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb)
- db u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c)
- db u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79)
- db u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9)
- db u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08)
- db u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6)
- db u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a)
- db u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e)
- db u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e)
- db u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94)
- db u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf)
- db u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68)
- db u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16)
-
-%endif
-
-%ifdef DECRYPTION
-
-; %define DECRYPTION_TABLE
-
-%define dtab_0(x) [_aes_dec_tab+ 8*x] ; table entries are 8 bytes apart; x is a register holding the entry index
-%define dtab_1(x) [_aes_dec_tab+3+8*x] ; dtab_1..dtab_3 read the same entry at byte offsets 3, 2 and 1
-%define dtab_2(x) [_aes_dec_tab+2+8*x]
-%define dtab_3(x) [_aes_dec_tab+1+8*x]
-%define dtab_x(x) byte [_aes_dec_tab+7+8*x] ; last byte of the entry, used by the last-round macros
-
-%macro irn_fun 2 ; irn_fun <xor macro>,<mov macro>: inverse round on eax,ebx,ecx,edx with esi,edi preloaded; results in eax,ebp,esi,edi
-
- rol eax,16 ; swap halves: the upper two bytes of eax become al,ah
- %1 esi, cl, 0, ebp
- %1 esi, bh, 1, ebp
- %1 esi, al, 2, ebp
- %1 edi, dl, 0, ebp
- %1 edi, ch, 1, ebp
- %1 edi, ah, 3, ebp
- %2 ebp, bl, 0, ebp ; first term of the ebp column: loaded with %2, not xored
- shr eax,16 ; eax = its original lower half
- and ebx,0xffff0000 ; keep the upper half of ebx ...
- or ebx,eax ; ... and pack the lower half of eax beneath it, freeing eax
- shr ecx,16 ; upper two bytes of ecx become cl,ch
- %1 ebp, bh, 1, eax
- %1 ebp, ch, 3, eax
- %2 eax, cl, 2, ecx ; first term of the eax column: loaded with %2, not xored
- %1 eax, bl, 0, ecx
- %1 eax, dh, 1, ecx
- shr ebx,16 ; upper two bytes of ebx become bl,bh
- shr edx,16 ; upper two bytes of edx become dl,dh
- %1 esi, dh, 3, ecx
- %1 ebp, dl, 2, ecx
- %1 eax, bh, 3, ecx
- %1 edi, bl, 2, ecx
-
-%endmacro
-
-; Basic MOV and XOR Operations for normal rounds: <op> dest, index byte, table number, scratch
-
-%macro ni_xor 4
- movzx %4,%2 ; scratch = zero-extended index byte
- xor %1,dtab_%3(%4) ; dest ^= dword from table view dtab_<n>
-%endmacro
-
-%macro ni_mov 4
- movzx %4,%2 ; scratch = zero-extended index byte
- mov %1,dtab_%3(%4) ; dest = dword from table view dtab_<n>
-%endmacro
-
-; Basic MOV and XOR Operations for last round: <op> dest, index byte, byte position, scratch
-
-%macro li_xor 4
- movzx %4,%2 ; scratch = zero-extended index byte
- movzx %4,dtab_x(%4) ; scratch = single table byte for that index
-%if %3 != 0
- shl %4,8*%3 ; move it to byte position <n>
-%endif
- xor %1,%4
-%endmacro
-
-%macro li_mov 4
- movzx %4,%2 ; scratch = zero-extended index byte
- movzx %1,dtab_x(%4) ; dest = single table byte for that index
-%if %3 != 0
- shl %1,8*%3 ; move it to byte position <n>
-%endif
-%endmacro
-
-%ifdef REDUCE_CODE_SIZE ; dec_round: one normal decryption round on eax,ebx,ecx,edx with ebp = round key pointer
-
-dec_round:
- sub sp, 2 ; presumably pads the 2-byte return address to 4 so that slot 1 here is the caller's slot 0
-%ifdef AES_REV_DKS
- add ebp,16 ; reversed schedule: walk upwards
-%else
- sub ebp,16 ; unreversed schedule: walk downwards from the top
-%endif
- save 1,ebp ; irn_fun reuses ebp as a column register
- mov esi,[ebp+8] ; preload round key words 2 and 3
- mov edi,[ebp+12]
-
- irn_fun ni_xor, ni_mov
-
- mov ebx,ebp ; move the new columns back into eax,ebx,ecx,edx
- mov ecx,esi
- mov edx,edi
- restore ebp,1
- xor eax,[ebp] ; round key words 0 and 1 are applied after the round function
- xor ebx,[ebp+4]
- add sp, 2 ; undo the padding before returning
- ret
-
-%else
-
-%macro dec_round 0
-
-%ifdef AES_REV_DKS
- add ebp,16 ; reversed schedule: walk upwards
-%else
- sub ebp,16 ; unreversed schedule: walk downwards from the top
-%endif
- save 0,ebp ; irn_fun reuses ebp as a column register
- mov esi,[ebp+8] ; preload round key words 2 and 3
- mov edi,[ebp+12]
-
- irn_fun ni_xor, ni_mov
-
- mov ebx,ebp ; move the new columns back into eax,ebx,ecx,edx
- mov ecx,esi
- mov edx,edi
- restore ebp,0
- xor eax,[ebp] ; round key words 0 and 1 are applied after the round function
- xor ebx,[ebp+4]
-
-%endmacro
-
-%endif
-
-%macro dec_last_round 0 ; final decryption round; result is left in eax,ebx,esi,edi
-
-%ifdef AES_REV_DKS
- add ebp,16 ; reversed schedule: walk upwards
-%else
- sub ebp,16 ; unreversed schedule: walk downwards from the top
-%endif
- save 0,ebp ; irn_fun reuses ebp as a column register
- mov esi,[ebp+8] ; preload round key words 2 and 3
- mov edi,[ebp+12]
-
- irn_fun li_xor, li_mov
-
- mov ebx,ebp ; second column; columns 2 and 3 stay in esi and edi
- restore ebp,0
- xor eax,[ebp] ; round key words 0 and 1 are applied after the round function
- xor ebx,[ebp+4]
-
-%endmacro
-
- section _TEXT
-
-; AES Decryption Subroutine: word-sized parameters in_blk, out_blk, ctx; eax is 0 on success, -1 on a bad round count
-
- do_name _aes_decrypt,12
-
- mov ax, sp
- movzx esp, ax ; clear the upper half of esp so [esp+...] addressing is usable
-
- sub esp,stk_spc
- mov [esp+16],ebp ; save the registers that are restored at .5
- mov [esp+12],ebx
- mov [esp+ 8],esi
- mov [esp+ 4],edi
-
-; input four columns and xor in first round key
-
- movzx esi,word [esp+in_blk+stk_spc] ; input pointer
- mov eax,[esi ]
- mov ebx,[esi+ 4]
- mov ecx,[esi+ 8]
- mov edx,[esi+12]
- lea esi,[esi+16] ; esi is reloaded from the key schedule by the round code before it is next read
-
- movzx ebp, word [esp+ctx+stk_spc] ; key pointer
- movzx edi,byte[ebp+4*KS_LENGTH] ; value stored after the key schedule (16 * number of rounds)
-%ifndef AES_REV_DKS ; if decryption key schedule is not reversed
- lea ebp,[ebp+edi] ; we have to access it from the top down
-%endif
- xor eax,[ebp ] ; key schedule
- xor ebx,[ebp+ 4]
- xor ecx,[ebp+ 8]
- xor edx,[ebp+12]
-
-; determine the number of rounds (skipped when AES_256 is defined: 14 rounds are always run)
-
-%ifndef AES_256
- cmp edi,10*16
- je .3
- cmp edi,12*16
- je .2
- cmp edi,14*16
- je .1
- mov eax,-1 ; unrecognised round count
- jmp .5
-%endif
-
-.1: mf_call dec_round ; 14-round entry
- mf_call dec_round
-.2: mf_call dec_round ; 12-round entry
- mf_call dec_round
-.3: mf_call dec_round ; 10-round entry
- mf_call dec_round
- mf_call dec_round
- mf_call dec_round
- mf_call dec_round
- mf_call dec_round
- mf_call dec_round
- mf_call dec_round
- mf_call dec_round
- dec_last_round
-
-; move final values to the output array.
-
- movzx ebp,word [esp+out_blk+stk_spc] ; output pointer
- mov [ebp],eax
- mov [ebp+4],ebx
- mov [ebp+8],esi
- mov [ebp+12],edi
- xor eax,eax ; return 0
-
-.5: mov ebp,[esp+16] ; common exit: restore saved registers
- mov ebx,[esp+12]
- mov esi,[esp+ 8]
- mov edi,[esp+ 4]
- add esp,stk_spc
- do_exit 12
-
-%endif
-
-%ifdef REDUCE_CODE_SIZE ; inv_mix_col: transform one key schedule word (see the decryption key routines)
-
-inv_mix_col:
- movzx ecx,dl ; input edx (clobbered)
- movzx ecx,etab_b(ecx) ; output eax
- mov eax,dtab_0(ecx) ; used ecx
- movzx ecx,dh
- shr edx,16 ; bring the upper two bytes down to dl,dh
- movzx ecx,etab_b(ecx) ; each byte goes through etab_b first, then indexes the decryption table
- xor eax,dtab_1(ecx)
- movzx ecx,dl
- movzx ecx,etab_b(ecx)
- xor eax,dtab_2(ecx)
- movzx ecx,dh
- movzx ecx,etab_b(ecx)
- xor eax,dtab_3(ecx)
- ret
-
-%else
-
-%macro inv_mix_col 0
-
- movzx ecx,dl ; input edx (clobbered)
- movzx ecx,etab_b(ecx) ; output eax
- mov eax,dtab_0(ecx) ; used ecx
- movzx ecx,dh
- shr edx,16 ; bring the upper two bytes down to dl,dh
- movzx ecx,etab_b(ecx) ; each byte goes through etab_b first, then indexes the decryption table
- xor eax,dtab_1(ecx)
- movzx ecx,dl
- movzx ecx,etab_b(ecx)
- xor eax,dtab_2(ecx)
- movzx ecx,dh
- movzx ecx,etab_b(ecx)
- xor eax,dtab_3(ecx)
-
-%endmacro
-
-%endif
-
-%ifdef DECRYPTION_KEY_SCHEDULE
-
-%ifdef AES_128
-
-%ifndef DECRYPTION_TABLE
-; %define DECRYPTION_TABLE
-%endif
-
- do_name _aes_decrypt_key128,8
-
- push ebp
- push ebx
- push esi
- push edi
- mov eax,[esp+24] ; context - NOTE(review): dword parameter offsets, unlike _aes_decrypt_key256; confirm before enabling AES_128 in the 16-bit build
- mov edx,[esp+20] ; key
- push eax
- push edx
- do_call _aes_encrypt_key128,8 ; generate expanded encryption key
- mov eax,10*16
- mov esi,[esp+24] ; pointer to first round key
- lea edi,[esi+eax] ; pointer to last round key
- add esi,32
- ; the inverse mix column transformation
- mov edx,[esi-16] ; needs to be applied to all round keys
- mf_call inv_mix_col ; except first and last. Hence start by
- mov [esi-16],eax ; transforming the four sub-keys in the
- mov edx,[esi-12] ; second round key
- mf_call inv_mix_col
- mov [esi-12],eax ; transformations for subsequent rounds
- mov edx,[esi-8] ; can then be made more efficient by
- mf_call inv_mix_col ; noting that for three of the four sub-keys
- mov [esi-8],eax ; in the encryption round key ek[r]:
- mov edx,[esi-4] ;
- mf_call inv_mix_col ; ek[r][n] = ek[r][n-1] ^ ek[r-1][n]
- mov [esi-4],eax ;
- ; where n is 1..3. Hence the corresponding
-.0: mov edx,[esi] ; subkeys in the decryption round key dk[r]
- mf_call inv_mix_col ; also obey since inv_mix_col is linear in
- mov [esi],eax ; GF(256):
- xor eax,[esi-12] ;
- mov [esi+4],eax ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
- xor eax,[esi-8] ;
- mov [esi+8],eax ; So we only need one inverse mix column
- xor eax,[esi-4] ; operation (n = 0) for each four word cycle
- mov [esi+12],eax ; in the expanded key.
- add esi,16
- cmp edi,esi ; stop when esi reaches the last round key, which is left unmodified
- jg .0
- jmp dec_end ; shared epilogue: optional reversal, register restore, return 0
-
-%endif
-
-%ifdef AES_192
-
-%ifndef DECRYPTION_TABLE
-; %define DECRYPTION_TABLE
-%endif
-
- do_name _aes_decrypt_key192,8
-
- push ebp
- push ebx
- push esi
- push edi
- mov eax,[esp+24] ; context - NOTE(review): dword parameter offsets, unlike _aes_decrypt_key256; confirm before enabling AES_192 in the 16-bit build
- mov edx,[esp+20] ; key
- push eax
- push edx
- do_call _aes_encrypt_key192,8 ; generate expanded encryption key
- mov eax,12*16
- mov esi,[esp+24] ; first round key
- lea edi,[esi+eax] ; last round key
- add esi,48 ; the first 6 words are the key, of
- ; which the top 2 words are part of
- mov edx,[esi-32] ; the second round key and hence
- mf_call inv_mix_col ; need to be modified. After this we
- mov [esi-32],eax ; need to do a further six values prior
- mov edx,[esi-28] ; to using a more efficient technique
- mf_call inv_mix_col ; based on:
- mov [esi-28],eax ;
- ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
- mov edx,[esi-24] ;
- mf_call inv_mix_col ; for n = 1 .. 5 where the key expansion
- mov [esi-24],eax ; cycle is now 6 words long
- mov edx,[esi-20]
- mf_call inv_mix_col
- mov [esi-20],eax
- mov edx,[esi-16]
- mf_call inv_mix_col
- mov [esi-16],eax
- mov edx,[esi-12]
- mf_call inv_mix_col
- mov [esi-12],eax
- mov edx,[esi-8]
- mf_call inv_mix_col
- mov [esi-8],eax
- mov edx,[esi-4]
- mf_call inv_mix_col
- mov [esi-4],eax
-
-.0: mov edx,[esi] ; the expanded key is 13 * 4 = 52 32-bit words
- mf_call inv_mix_col ; of which 11 * 4 = 44 have to be modified
- mov [esi],eax ; using inv_mix_col. We have already done 8
- xor eax,[esi-20] ; of these so 36 are left - hence we need
- mov [esi+4],eax ; exactly 6 loops of six here
- xor eax,[esi-16]
- mov [esi+8],eax
- xor eax,[esi-12]
- mov [esi+12],eax
- xor eax,[esi-8]
- mov [esi+16],eax
- xor eax,[esi-4]
- mov [esi+20],eax
- add esi,24
- cmp edi,esi ; stop when esi reaches the last round key, which is left unmodified
- jg .0
- jmp dec_end ; shared epilogue: optional reversal, register restore, return 0
-
-%endif
-
-%ifdef AES_256
-
-%ifndef DECRYPTION_TABLE
-; %define DECRYPTION_TABLE
-%endif
-
- do_name _aes_decrypt_key256,8
-
- mov ax, sp
- movzx esp, ax ; clear the upper half of esp so [esp+...] addressing is usable
- push ebp
- push ebx
- push esi
- push edi
-
- movzx eax, word [esp+20] ; ks
- movzx edx, word [esp+18] ; key
- push ax ; pass both as 16-bit words: 4 bytes of parameters in total
- push dx
- do_call _aes_encrypt_key256,4 ; generate expanded encryption key
- mov eax,14*16
- movzx esi, word [esp+20] ; ks
- lea edi,[esi+eax] ; last round key
- add esi,64
-
- mov edx,[esi-48] ; the primary key is 8 words, of which
- mf_call inv_mix_col ; the top four require modification
- mov [esi-48],eax
- mov edx,[esi-44]
- mf_call inv_mix_col
- mov [esi-44],eax
- mov edx,[esi-40]
- mf_call inv_mix_col
- mov [esi-40],eax
- mov edx,[esi-36]
- mf_call inv_mix_col
- mov [esi-36],eax
-
- mov edx,[esi-32] ; the encryption key expansion cycle is
- mf_call inv_mix_col ; now eight words long so we need to
- mov [esi-32],eax ; start by doing one complete block
- mov edx,[esi-28]
- mf_call inv_mix_col
- mov [esi-28],eax
- mov edx,[esi-24]
- mf_call inv_mix_col
- mov [esi-24],eax
- mov edx,[esi-20]
- mf_call inv_mix_col
- mov [esi-20],eax
- mov edx,[esi-16]
- mf_call inv_mix_col
- mov [esi-16],eax
- mov edx,[esi-12]
- mf_call inv_mix_col
- mov [esi-12],eax
- mov edx,[esi-8]
- mf_call inv_mix_col
- mov [esi-8],eax
- mov edx,[esi-4]
- mf_call inv_mix_col
- mov [esi-4],eax
-
-.0: mov edx,[esi] ; we can now speed up the remaining
- mf_call inv_mix_col ; rounds by using the technique
- mov [esi],eax ; outlined earlier. But note that
- xor eax,[esi-28] ; there is one extra inverse mix
- mov [esi+4],eax ; column operation as the 256 bit
- xor eax,[esi-24] ; key has an extra non-linear step
- mov [esi+8],eax ; for the midway element.
- xor eax,[esi-20]
- mov [esi+12],eax ; the expanded key is 15 * 4 = 60
- mov edx,[esi+16] ; 32-bit words of which 52 need to
- mf_call inv_mix_col ; be modified. We have already done
- mov [esi+16],eax ; 12 so 40 are left - which means
- xor eax,[esi-12] ; that we need exactly 5 loops of 8
- mov [esi+20],eax
- xor eax,[esi-8]
- mov [esi+24],eax
- xor eax,[esi-4]
- mov [esi+28],eax
- add esi,32
- cmp edi,esi ; stop when esi reaches the last round key; execution then falls through to dec_end
- jg .0
-
-%endif
-
-dec_end: ; shared tail of the decryption key routines; expects edi = last round key and four saved dwords on the stack
-
-%ifdef AES_REV_DKS
-
- movzx esi,word [esp+20] ; this reverses the order of the
-.1: mov eax,[esi] ; round keys if required
- mov ebx,[esi+4]
- mov ebp,[edi]
- mov edx,[edi+4]
- mov [esi],ebp ; swap the first two words of the low and high round keys
- mov [esi+4],edx
- mov [edi],eax
- mov [edi+4],ebx
-
- mov eax,[esi+8]
- mov ebx,[esi+12]
- mov ebp,[edi+8]
- mov edx,[edi+12]
- mov [esi+8],ebp ; swap the last two words of the low and high round keys
- mov [esi+12],edx
- mov [edi+8],eax
- mov [edi+12],ebx
-
- add esi,16 ; move both ends one round key towards the middle
- sub edi,16
- cmp edi,esi
- jg .1
-
-%endif
-
- pop edi ; restore the registers pushed at routine entry
- pop esi
- pop ebx
- pop ebp
- xor eax,eax ; return 0
- do_exit 8
-
-%ifdef AES_VAR
-
- do_name _aes_decrypt_key,12 ; dispatch on key length given in bytes or bits; eax = -1 for any other length
-
- mov ecx,[esp+4] ; NOTE(review): dword parameter offsets and dword pushes - confirm against the 16-bit key routines before enabling AES_VAR
- mov eax,[esp+8] ; key length
- mov edx,[esp+12]
- push edx ; re-push the context and key for the fixed-size routine
- push ecx
-
- cmp eax,16
- je .1
- cmp eax,128
- je .1
-
- cmp eax,24
- je .2
- cmp eax,192
- je .2
-
- cmp eax,32
- je .3
- cmp eax,256
- je .3
- mov eax,-1 ; unsupported key length
- add esp,8 ; discard the two values pushed above
- do_exit 12
-
-.1: do_call _aes_decrypt_key128,8
- do_exit 12
-.2: do_call _aes_decrypt_key192,8
- do_exit 12
-.3: do_call _aes_decrypt_key256,8
- do_exit 12
-
-%endif
-
-%endif
-
-%ifdef DECRYPTION_TABLE
-
-; Inverse S-box data - 256 entries
-
- section _DATA
-
-%define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x ; 8 bytes per value x, read at different byte offsets by dtab_0..dtab_3; the last byte (x itself) is dtab_x
-
-_aes_dec_tab:
- db v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38)
- db v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb)
- db v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87)
- db v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb)
- db v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d)
- db v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e)
- db v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2)
- db v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25)
- db v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16)
- db v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92)
- db v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda)
- db v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84)
- db v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a)
- db v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06)
- db v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02)
- db v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b)
- db v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea)
- db v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73)
- db v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85)
- db v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e)
- db v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89)
- db v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b)
- db v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20)
- db v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4)
- db v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31)
- db v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f)
- db v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d)
- db v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef)
- db v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0)
- db v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61)
- db v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26)
- db v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d)
-
-%endif
+ +; --------------------------------------------------------------------------- +; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. +; +; LICENSE TERMS +; +; The free distribution and use of this software is allowed (with or without +; changes) provided that: +; +; 1. source code distributions include the above copyright notice, this +; list of conditions and the following disclaimer; +; +; 2. binary distributions include the above copyright notice, this list +; of conditions and the following disclaimer in their documentation; +; +; 3. the name of the copyright holder is not used to endorse products +; built using this software without specific written permission. +; +; DISCLAIMER +; +; This software is provided 'as is' with no explicit or implied warranties +; in respect of its properties, including, but not limited to, correctness +; and/or fitness for purpose. +; --------------------------------------------------------------------------- +; Issue 20/12/2007 +; +; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h +; and the same define to be set here as well. If AES_V2C is set this file +; requires the C files aeskey.c and aestab.c for support. + +; An AES implementation for x86 processors using the YASM (or NASM) assembler. +; This is a full assembler implementation covering encryption, decryption and +; key scheduling. It uses 2k bytes of tables but its encryption and decryption +; performance is very close to that obtained using large tables. Key schedule +; expansion is slower for both encryption and decryption but this is likely to +; be offset by the much smaller load that this version places on the processor +; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of +; the design of the AES round function used here. +; +; This code provides the standard AES block size (128 bits, 16 bytes) and the +; three standard AES key sizes (128, 192 and 256 bits). 
It has the same call +; interface as my C implementation. The ebx, esi, edi and ebp registers are +; preserved across calls but eax, ecx and edx and the arithmetic status flags +; are not. Although this is a full assembler implementation, it can be used +; in conjunction with my C code which provides faster key scheduling using +; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C +; defined. It is also important that the defines below match those used in the +; C code. This code uses the VC++ register saving conventions; if it is used +; with another compiler, conventions for using and saving registers may need +; to be checked (and calling conventions). The YASM command line for the VC++ +; custom build step is: +; +; yasm -Xvc -f win32 -D <Z> -o "$(TargetDir)\$(InputName).obj" "$(InputPath)" +; +; For the cryptlib build this is (pcg): +; +; yasm -Xvc -f win32 -D ASM_X86_V2C -o aescrypt2.obj aes_x86_v2.asm +; +; where <Z> is ASM_X86_V2 or ASM_X86_V2C. The calling interfaces are: +; +; AES_RETURN aes_encrypt(const unsigned char in_blk[], +; unsigned char out_blk[], const aes_encrypt_ctx cx[1]); +; +; AES_RETURN aes_decrypt(const unsigned char in_blk[], +; unsigned char out_blk[], const aes_decrypt_ctx cx[1]); +; +; AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[], +; const aes_encrypt_ctx cx[1]); +; +; AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[], +; const aes_decrypt_ctx cx[1]); +; +; AES_RETURN aes_encrypt_key(const unsigned char key[], +; unsigned int len, const aes_encrypt_ctx cx[1]); +; +; AES_RETURN aes_decrypt_key(const unsigned char key[], +; unsigned int len, const aes_decrypt_ctx cx[1]); +; +; where <NNN> is 128, 192 or 256. In the last two calls the length can be in +; either bits or bytes. + +; The DLL interface must use the _stdcall convention in which the number +; of bytes of parameter space is added after an @ to the subroutine's name. 
+; We must also remove our parameters from the stack before return (see +; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version. + +; +; Adapted for TrueCrypt: +; - All tables generated at run-time +; - Adapted for 16-bit environment +; + +CPU 386 +USE16 +SEGMENT _TEXT PUBLIC CLASS=CODE USE16 +SEGMENT _DATA PUBLIC CLASS=DATA USE16 + +GROUP DGROUP _TEXT _DATA + +extern _aes_dec_tab ; Aestab.c +extern _aes_enc_tab + +; %define DLL_EXPORT + +; The size of the code can be reduced by using functions for the encryption +; and decryption rounds in place of macro expansion + +%define REDUCE_CODE_SIZE + +; Comment in/out the following lines to obtain the desired subroutines. These +; selections MUST match those in the C header file aes.h + +; %define AES_128 ; define if AES with 128 bit keys is needed +; %define AES_192 ; define if AES with 192 bit keys is needed +%define AES_256 ; define if AES with 256 bit keys is needed +; %define AES_VAR ; define if a variable key size is needed +%define ENCRYPTION ; define if encryption is needed +%define DECRYPTION ; define if decryption is needed +; %define AES_REV_DKS ; define if key decryption schedule is reversed + +%ifndef ASM_X86_V2C +%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed +%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed +%endif + +; The encryption key schedule has the following in memory layout where N is the +; number of rounds (10, 12 or 14): +; +; lo: | input key (round 0) | ; each round is four 32-bit words +; | encryption round 1 | +; | encryption round 2 | +; .... 
+; | encryption round N-1 | +; hi: | encryption round N | +; +; The decryption key schedule is normally set up so that it has the same +; layout as above by actually reversing the order of the encryption key +; schedule in memory (this happens when AES_REV_DKS is set): +; +; lo: | decryption round 0 | = | encryption round N | +; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] +; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] +; .... .... +; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] +; hi: | decryption round N | = | input key (round 0) | +; +; with rounds except the first and last modified using inv_mix_column() +; But if AES_REV_DKS is NOT set the order of keys is left as it is for +; encryption so that it has to be accessed in reverse when used for +; decryption (although the inverse mix column modifications are done) +; +; lo: | decryption round 0 | = | input key (round 0) | +; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] +; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] +; .... .... +; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] +; hi: | decryption round N | = | encryption round N | +; +; This layout is faster when the assembler key scheduling provided here +; is used. 
+; +; End of user defines + +%ifdef AES_VAR +%ifndef AES_128 +%define AES_128 +%endif +%ifndef AES_192 +%define AES_192 +%endif +%ifndef AES_256 +%define AES_256 +%endif +%endif + +%ifdef AES_VAR +%define KS_LENGTH 60 +%elifdef AES_256 +%define KS_LENGTH 60 +%elifdef AES_192 +%define KS_LENGTH 52 +%else +%define KS_LENGTH 44 +%endif + +; These macros implement stack based local variables + +%macro save 2 + mov [esp+4*%1],%2 +%endmacro + +%macro restore 2 + mov %1,[esp+4*%2] +%endmacro + +%ifdef REDUCE_CODE_SIZE + %macro mf_call 1 + call %1 + %endmacro +%else + %macro mf_call 1 + %1 + %endmacro +%endif + +; the DLL has to implement the _stdcall calling interface on return +; In this case we have to take our parameters (3 4-byte pointers) +; off the stack + +%define parms 12 + +%macro do_name 1-2 parms +%ifndef DLL_EXPORT + global %1 +%1: +%else + global %1@%2 + export %1@%2 +%1@%2: +%endif +%endmacro + +%macro do_call 1-2 parms +%ifndef DLL_EXPORT + call %1 + add esp,%2 +%else + call %1@%2 +%endif +%endmacro + +%macro do_exit 0-1 parms +%ifdef DLL_EXPORT + ret %1 +%else + ret +%endif +%endmacro + +; finite field multiplies by {02}, {04} and {08} + +%define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) +%define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) +%define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) + +; finite field multiplies required in table generation + +%define f3(x) (f2(x) ^ x) +%define f9(x) (f8(x) ^ x) +%define fb(x) (f8(x) ^ f2(x) ^ x) +%define fd(x) (f8(x) ^ f4(x) ^ x) +%define fe(x) (f8(x) ^ f4(x) ^ f2(x)) + +%define etab_0(x) [_aes_enc_tab+4+8*x] +%define etab_1(x) [_aes_enc_tab+3+8*x] +%define etab_2(x) [_aes_enc_tab+2+8*x] +%define etab_3(x) [_aes_enc_tab+1+8*x] +%define etab_b(x) byte [_aes_enc_tab+1+8*x] ; used with movzx for 0x000000xx +%define etab_w(x) word [_aes_enc_tab+8*x] ; used with movzx for 0x0000xx00 + +%define btab_0(x) [_aes_enc_tab+6+8*x] +%define btab_1(x) [_aes_enc_tab+5+8*x] +%define btab_2(x) 
[_aes_enc_tab+4+8*x] +%define btab_3(x) [_aes_enc_tab+3+8*x] + +; ROUND FUNCTION. Build column[2] on ESI and column[3] on EDI that have the +; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX. +; +; Input: +; +; EAX column[0] +; EBX column[1] +; ECX column[2] +; EDX column[3] +; ESI column key[round][2] +; EDI column key[round][3] +; EBP scratch +; +; Output: +; +; EBP column[0] unkeyed +; EBX column[1] unkeyed +; ESI column[2] keyed +; EDI column[3] keyed +; EAX scratch +; ECX scratch +; EDX scratch + +%macro rnd_fun 2 + + rol ebx,16 + %1 esi, cl, 0, ebp + %1 esi, dh, 1, ebp + %1 esi, bh, 3, ebp + %1 edi, dl, 0, ebp + %1 edi, ah, 1, ebp + %1 edi, bl, 2, ebp + %2 ebp, al, 0, ebp + shr ebx,16 + and eax,0xffff0000 + or eax,ebx + shr edx,16 + %1 ebp, ah, 1, ebx + %1 ebp, dh, 3, ebx + %2 ebx, dl, 2, ebx + %1 ebx, ch, 1, edx + %1 ebx, al, 0, edx + shr eax,16 + shr ecx,16 + %1 ebp, cl, 2, edx + %1 edi, ch, 3, edx + %1 esi, al, 2, edx + %1 ebx, ah, 3, edx + +%endmacro + +; Basic MOV and XOR Operations for normal rounds + +%macro nr_xor 4 + movzx %4,%2 + xor %1,etab_%3(%4) +%endmacro + +%macro nr_mov 4 + movzx %4,%2 + mov %1,etab_%3(%4) +%endmacro + +; Basic MOV and XOR Operations for last round + +%if 1 + + %macro lr_xor 4 + movzx %4,%2 + movzx %4,etab_b(%4) + %if %3 != 0 + shl %4,8*%3 + %endif + xor %1,%4 + %endmacro + + %macro lr_mov 4 + movzx %4,%2 + movzx %1,etab_b(%4) + %if %3 != 0 + shl %1,8*%3 + %endif + %endmacro + +%else ; less effective but worth leaving as an option + + %macro lr_xor 4 + movzx %4,%2 + mov %4,btab_%3(%4) + and %4,0x000000ff << 8 * %3 + xor %1,%4 + %endmacro + + %macro lr_mov 4 + movzx %4,%2 + mov %1,btab_%3(%4) + and %1,0x000000ff << 8 * %3 + %endmacro + +%endif + +; Apply S-Box to the 4 bytes in a 32-bit word and rotate byte positions + +%ifdef REDUCE_CODE_SIZE + +l3s_col: + movzx ecx,al ; in eax + movzx ecx, etab_b(ecx) ; out eax + xor edx,ecx ; scratch ecx,edx + movzx ecx,ah + movzx ecx, etab_b(ecx) + shl ecx,8 + xor edx,ecx 
+ shr eax,16 + movzx ecx,al + movzx ecx, etab_b(ecx) + shl ecx,16 + xor edx,ecx + movzx ecx,ah + movzx ecx, etab_b(ecx) + shl ecx,24 + xor edx,ecx + mov eax,edx + ret + +%else + +%macro l3s_col 0 + + movzx ecx,al ; in eax + movzx ecx, etab_b(ecx) ; out eax + xor edx,ecx ; scratch ecx,edx + movzx ecx,ah + movzx ecx, etab_b(ecx) + shl ecx,8 + xor edx,ecx + shr eax,16 + movzx ecx,al + movzx ecx, etab_b(ecx) + shl ecx,16 + xor edx,ecx + movzx ecx,ah + movzx ecx, etab_b(ecx) + shl ecx,24 + xor edx,ecx + mov eax,edx + +%endmacro + +%endif + +; offsets to parameters + +in_blk equ 2 ; input byte array address parameter +out_blk equ 4 ; output byte array address parameter +ctx equ 6 ; AES context structure +stk_spc equ 20 ; stack space + +%ifdef ENCRYPTION + +; %define ENCRYPTION_TABLE + +%ifdef REDUCE_CODE_SIZE + +enc_round: + sub sp, 2 + add ebp,16 + save 1,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + rnd_fun nr_xor, nr_mov + + mov eax,ebp + mov ecx,esi + mov edx,edi + restore ebp,1 + xor eax,[ebp] + xor ebx,[ebp+4] + add sp, 2 + ret + +%else + +%macro enc_round 0 + + add ebp,16 + save 0,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + rnd_fun nr_xor, nr_mov + + mov eax,ebp + mov ecx,esi + mov edx,edi + restore ebp,0 + xor eax,[ebp] + xor ebx,[ebp+4] + +%endmacro + +%endif + +%macro enc_last_round 0 + + add ebp,16 + save 0,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + rnd_fun lr_xor, lr_mov + + mov eax,ebp + restore ebp,0 + xor eax,[ebp] + xor ebx,[ebp+4] + +%endmacro + + section _TEXT + +; AES Encryption Subroutine + + do_name _aes_encrypt,12 + + mov ax, sp + movzx esp, ax + + sub esp,stk_spc + mov [esp+16],ebp + mov [esp+12],ebx + mov [esp+ 8],esi + mov [esp+ 4],edi + + movzx esi,word [esp+in_blk+stk_spc] ; input pointer + mov eax,[esi ] + mov ebx,[esi+ 4] + mov ecx,[esi+ 8] + mov edx,[esi+12] + + movzx ebp,word [esp+ctx+stk_spc] ; key pointer + movzx edi,byte [ebp+4*KS_LENGTH] + xor eax,[ebp ] + xor ebx,[ebp+ 4] + xor ecx,[ebp+ 8] + xor edx,[ebp+12] + +; determine the 
number of rounds + +%ifndef AES_256 + cmp edi,10*16 + je .3 + cmp edi,12*16 + je .2 + cmp edi,14*16 + je .1 + mov eax,-1 + jmp .5 +%endif + +.1: mf_call enc_round + mf_call enc_round +.2: mf_call enc_round + mf_call enc_round +.3: mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + mf_call enc_round + enc_last_round + + movzx edx,word [esp+out_blk+stk_spc] + mov [edx],eax + mov [edx+4],ebx + mov [edx+8],esi + mov [edx+12],edi + xor eax,eax + +.5: mov ebp,[esp+16] + mov ebx,[esp+12] + mov esi,[esp+ 8] + mov edi,[esp+ 4] + add esp,stk_spc + do_exit 12 + +%endif + +%macro f_key 2 + + push ecx + push edx + mov edx,esi + ror eax,8 + mf_call l3s_col + mov esi,eax + pop edx + pop ecx + xor esi,rc_val + + mov [ebp+%1*%2],esi + xor edi,esi + mov [ebp+%1*%2+4],edi + xor ecx,edi + mov [ebp+%1*%2+8],ecx + xor edx,ecx + mov [ebp+%1*%2+12],edx + mov eax,edx + +%if %2 == 24 + +%if %1 < 7 + xor eax,[ebp+%1*%2+16-%2] + mov [ebp+%1*%2+16],eax + xor eax,[ebp+%1*%2+20-%2] + mov [ebp+%1*%2+20],eax +%endif + +%elif %2 == 32 + +%if %1 < 6 + push ecx + push edx + mov edx,[ebp+%1*%2+16-%2] + mf_call l3s_col + pop edx + pop ecx + mov [ebp+%1*%2+16],eax + xor eax,[ebp+%1*%2+20-%2] + mov [ebp+%1*%2+20],eax + xor eax,[ebp+%1*%2+24-%2] + mov [ebp+%1*%2+24],eax + xor eax,[ebp+%1*%2+28-%2] + mov [ebp+%1*%2+28],eax +%endif + +%endif + +%assign rc_val f2(rc_val) + +%endmacro + +%ifdef ENCRYPTION_KEY_SCHEDULE + +%ifdef AES_128 + +%ifndef ENCRYPTION_TABLE +; %define ENCRYPTION_TABLE +%endif + +%assign rc_val 1 + + do_name _aes_encrypt_key128,8 + + push ebp + push ebx + push esi + push edi + + mov ebp,[esp+24] + mov [ebp+4*KS_LENGTH],dword 10*16 + mov ebx,[esp+20] + + mov esi,[ebx] + mov [ebp],esi + mov edi,[ebx+4] + mov [ebp+4],edi + mov ecx,[ebx+8] + mov [ebp+8],ecx + mov edx,[ebx+12] + mov [ebp+12],edx + add ebp,16 + mov eax,edx + + f_key 0,16 ; 11 * 4 = 44 unsigned longs + f_key 1,16 ; 4 + 4 
* 10 generated = 44 + f_key 2,16 + f_key 3,16 + f_key 4,16 + f_key 5,16 + f_key 6,16 + f_key 7,16 + f_key 8,16 + f_key 9,16 + + pop edi + pop esi + pop ebx + pop ebp + xor eax,eax + do_exit 8 + +%endif + +%ifdef AES_192 + +%ifndef ENCRYPTION_TABLE +; %define ENCRYPTION_TABLE +%endif + +%assign rc_val 1 + + do_name _aes_encrypt_key192,8 + + push ebp + push ebx + push esi + push edi + + mov ebp,[esp+24] + mov [ebp+4*KS_LENGTH],dword 12 * 16 + mov ebx,[esp+20] + + mov esi,[ebx] + mov [ebp],esi + mov edi,[ebx+4] + mov [ebp+4],edi + mov ecx,[ebx+8] + mov [ebp+8],ecx + mov edx,[ebx+12] + mov [ebp+12],edx + mov eax,[ebx+16] + mov [ebp+16],eax + mov eax,[ebx+20] + mov [ebp+20],eax + add ebp,24 + + f_key 0,24 ; 13 * 4 = 52 unsigned longs + f_key 1,24 ; 6 + 6 * 8 generated = 54 + f_key 2,24 + f_key 3,24 + f_key 4,24 + f_key 5,24 + f_key 6,24 + f_key 7,24 + + pop edi + pop esi + pop ebx + pop ebp + xor eax,eax + do_exit 8 + +%endif + +%ifdef AES_256 + +%ifndef ENCRYPTION_TABLE +; %define ENCRYPTION_TABLE +%endif + +%assign rc_val 1 + + do_name _aes_encrypt_key256,8 + + mov ax, sp + movzx esp, ax + + push ebp + push ebx + push esi + push edi + + movzx ebp, word [esp+20] ; ks + mov [ebp+4*KS_LENGTH],dword 14 * 16 + movzx ebx, word [esp+18] ; key + + mov esi,[ebx] + mov [ebp],esi + mov edi,[ebx+4] + mov [ebp+4],edi + mov ecx,[ebx+8] + mov [ebp+8],ecx + mov edx,[ebx+12] + mov [ebp+12],edx + mov eax,[ebx+16] + mov [ebp+16],eax + mov eax,[ebx+20] + mov [ebp+20],eax + mov eax,[ebx+24] + mov [ebp+24],eax + mov eax,[ebx+28] + mov [ebp+28],eax + add ebp,32 + + f_key 0,32 ; 15 * 4 = 60 unsigned longs + f_key 1,32 ; 8 + 8 * 7 generated = 64 + f_key 2,32 + f_key 3,32 + f_key 4,32 + f_key 5,32 + f_key 6,32 + + pop edi + pop esi + pop ebx + pop ebp + xor eax,eax + do_exit 8 + +%endif + +%ifdef AES_VAR + +%ifndef ENCRYPTION_TABLE +; %define ENCRYPTION_TABLE +%endif + + do_name _aes_encrypt_key,12 + + mov ecx,[esp+4] + mov eax,[esp+8] + mov edx,[esp+12] + push edx + push ecx + + cmp eax,16 + 
je .1 + cmp eax,128 + je .1 + + cmp eax,24 + je .2 + cmp eax,192 + je .2 + + cmp eax,32 + je .3 + cmp eax,256 + je .3 + mov eax,-1 + add esp,8 + do_exit 12 + +.1: do_call _aes_encrypt_key128,8 + do_exit 12 +.2: do_call _aes_encrypt_key192,8 + do_exit 12 +.3: do_call _aes_encrypt_key256,8 + do_exit 12 + +%endif + +%endif + +%ifdef ENCRYPTION_TABLE + +; S-box data - 256 entries + + section _DATA + +%define u8(x) 0, x, x, f3(x), f2(x), x, x, f3(x) + +_aes_enc_tab: + db u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5) + db u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76) + db u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0) + db u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0) + db u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc) + db u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15) + db u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a) + db u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75) + db u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0) + db u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84) + db u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b) + db u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf) + db u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85) + db u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8) + db u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5) + db u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2) + db u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17) + db u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73) + db u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88) + db 
u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb) + db u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c) + db u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79) + db u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9) + db u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08) + db u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6) + db u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a) + db u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e) + db u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e) + db u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94) + db u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf) + db u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68) + db u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16) + +%endif + +%ifdef DECRYPTION + +; %define DECRYPTION_TABLE + +%define dtab_0(x) [_aes_dec_tab+ 8*x] +%define dtab_1(x) [_aes_dec_tab+3+8*x] +%define dtab_2(x) [_aes_dec_tab+2+8*x] +%define dtab_3(x) [_aes_dec_tab+1+8*x] +%define dtab_x(x) byte [_aes_dec_tab+7+8*x] + +%macro irn_fun 2 + + rol eax,16 + %1 esi, cl, 0, ebp + %1 esi, bh, 1, ebp + %1 esi, al, 2, ebp + %1 edi, dl, 0, ebp + %1 edi, ch, 1, ebp + %1 edi, ah, 3, ebp + %2 ebp, bl, 0, ebp + shr eax,16 + and ebx,0xffff0000 + or ebx,eax + shr ecx,16 + %1 ebp, bh, 1, eax + %1 ebp, ch, 3, eax + %2 eax, cl, 2, ecx + %1 eax, bl, 0, ecx + %1 eax, dh, 1, ecx + shr ebx,16 + shr edx,16 + %1 esi, dh, 3, ecx + %1 ebp, dl, 2, ecx + %1 eax, bh, 3, ecx + %1 edi, bl, 2, ecx + +%endmacro + +; Basic MOV and XOR Operations for normal rounds + +%macro ni_xor 4 + movzx %4,%2 + xor %1,dtab_%3(%4) +%endmacro + +%macro ni_mov 4 + movzx %4,%2 + mov %1,dtab_%3(%4) +%endmacro + +; Basic MOV and XOR Operations for last round + +%macro li_xor 4 + movzx %4,%2 + 
movzx %4,dtab_x(%4) +%if %3 != 0 + shl %4,8*%3 +%endif + xor %1,%4 +%endmacro + +%macro li_mov 4 + movzx %4,%2 + movzx %1,dtab_x(%4) +%if %3 != 0 + shl %1,8*%3 +%endif +%endmacro + +%ifdef REDUCE_CODE_SIZE + +dec_round: + sub sp, 2 +%ifdef AES_REV_DKS + add ebp,16 +%else + sub ebp,16 +%endif + save 1,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + irn_fun ni_xor, ni_mov + + mov ebx,ebp + mov ecx,esi + mov edx,edi + restore ebp,1 + xor eax,[ebp] + xor ebx,[ebp+4] + add sp, 2 + ret + +%else + +%macro dec_round 0 + +%ifdef AES_REV_DKS + add ebp,16 +%else + sub ebp,16 +%endif + save 0,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + irn_fun ni_xor, ni_mov + + mov ebx,ebp + mov ecx,esi + mov edx,edi + restore ebp,0 + xor eax,[ebp] + xor ebx,[ebp+4] + +%endmacro + +%endif + +%macro dec_last_round 0 + +%ifdef AES_REV_DKS + add ebp,16 +%else + sub ebp,16 +%endif + save 0,ebp + mov esi,[ebp+8] + mov edi,[ebp+12] + + irn_fun li_xor, li_mov + + mov ebx,ebp + restore ebp,0 + xor eax,[ebp] + xor ebx,[ebp+4] + +%endmacro + + section _TEXT + +; AES Decryption Subroutine + + do_name _aes_decrypt,12 + + mov ax, sp + movzx esp, ax + + sub esp,stk_spc + mov [esp+16],ebp + mov [esp+12],ebx + mov [esp+ 8],esi + mov [esp+ 4],edi + +; input four columns and xor in first round key + + movzx esi,word [esp+in_blk+stk_spc] ; input pointer + mov eax,[esi ] + mov ebx,[esi+ 4] + mov ecx,[esi+ 8] + mov edx,[esi+12] + lea esi,[esi+16] + + movzx ebp, word [esp+ctx+stk_spc] ; key pointer + movzx edi,byte[ebp+4*KS_LENGTH] +%ifndef AES_REV_DKS ; if decryption key schedule is not reversed + lea ebp,[ebp+edi] ; we have to access it from the top down +%endif + xor eax,[ebp ] ; key schedule + xor ebx,[ebp+ 4] + xor ecx,[ebp+ 8] + xor edx,[ebp+12] + +; determine the number of rounds + +%ifndef AES_256 + cmp edi,10*16 + je .3 + cmp edi,12*16 + je .2 + cmp edi,14*16 + je .1 + mov eax,-1 + jmp .5 +%endif + +.1: mf_call dec_round + mf_call dec_round +.2: mf_call dec_round + mf_call dec_round +.3: mf_call dec_round + 
mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + mf_call dec_round + dec_last_round + +; move final values to the output array. + + movzx ebp,word [esp+out_blk+stk_spc] + mov [ebp],eax + mov [ebp+4],ebx + mov [ebp+8],esi + mov [ebp+12],edi + xor eax,eax + +.5: mov ebp,[esp+16] + mov ebx,[esp+12] + mov esi,[esp+ 8] + mov edi,[esp+ 4] + add esp,stk_spc + do_exit 12 + +%endif + +%ifdef REDUCE_CODE_SIZE + +inv_mix_col: + movzx ecx,dl ; input eax, edx + movzx ecx,etab_b(ecx) ; output eax + mov eax,dtab_0(ecx) ; used ecx + movzx ecx,dh + shr edx,16 + movzx ecx,etab_b(ecx) + xor eax,dtab_1(ecx) + movzx ecx,dl + movzx ecx,etab_b(ecx) + xor eax,dtab_2(ecx) + movzx ecx,dh + movzx ecx,etab_b(ecx) + xor eax,dtab_3(ecx) + ret + +%else + +%macro inv_mix_col 0 + + movzx ecx,dl ; input eax, edx + movzx ecx,etab_b(ecx) ; output eax + mov eax,dtab_0(ecx) ; used ecx + movzx ecx,dh + shr edx,16 + movzx ecx,etab_b(ecx) + xor eax,dtab_1(ecx) + movzx ecx,dl + movzx ecx,etab_b(ecx) + xor eax,dtab_2(ecx) + movzx ecx,dh + movzx ecx,etab_b(ecx) + xor eax,dtab_3(ecx) + +%endmacro + +%endif + +%ifdef DECRYPTION_KEY_SCHEDULE + +%ifdef AES_128 + +%ifndef DECRYPTION_TABLE +; %define DECRYPTION_TABLE +%endif + + do_name _aes_decrypt_key128,8 + + push ebp + push ebx + push esi + push edi + mov eax,[esp+24] ; context + mov edx,[esp+20] ; key + push eax + push edx + do_call _aes_encrypt_key128,8 ; generate expanded encryption key + mov eax,10*16 + mov esi,[esp+24] ; pointer to first round key + lea edi,[esi+eax] ; pointer to last round key + add esi,32 + ; the inverse mix column transformation + mov edx,[esi-16] ; needs to be applied to all round keys + mf_call inv_mix_col ; except first and last. 
Hence start by + mov [esi-16],eax ; transforming the four sub-keys in the + mov edx,[esi-12] ; second round key + mf_call inv_mix_col + mov [esi-12],eax ; transformations for subsequent rounds + mov edx,[esi-8] ; can then be made more efficient by + mf_call inv_mix_col ; noting that for three of the four sub-keys + mov [esi-8],eax ; in the encryption round key ek[r]: + mov edx,[esi-4] ; + mf_call inv_mix_col ; ek[r][n] = ek[r][n-1] ^ ek[r-1][n] + mov [esi-4],eax ; + ; where n is 1..3. Hence the corresponding +.0: mov edx,[esi] ; subkeys in the decryption round key dk[r] + mf_call inv_mix_col ; also obey since inv_mix_col is linear in + mov [esi],eax ; GF(256): + xor eax,[esi-12] ; + mov [esi+4],eax ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n] + xor eax,[esi-8] ; + mov [esi+8],eax ; So we only need one inverse mix column + xor eax,[esi-4] ; operation (n = 0) for each four word cycle + mov [esi+12],eax ; in the expanded key. + add esi,16 + cmp edi,esi + jg .0 + jmp dec_end + +%endif + +%ifdef AES_192 + +%ifndef DECRYPTION_TABLE +; %define DECRYPTION_TABLE +%endif + + do_name _aes_decrypt_key192,8 + + push ebp + push ebx + push esi + push edi + mov eax,[esp+24] ; context + mov edx,[esp+20] ; key + push eax + push edx + do_call _aes_encrypt_key192,8 ; generate expanded encryption key + mov eax,12*16 + mov esi,[esp+24] ; first round key + lea edi,[esi+eax] ; last round key + add esi,48 ; the first 6 words are the key, of + ; which the top 2 words are part of + mov edx,[esi-32] ; the second round key and hence + mf_call inv_mix_col ; need to be modified. After this we + mov [esi-32],eax ; need to do a further six values prior + mov edx,[esi-28] ; to using a more efficient technique + mf_call inv_mix_col ; based on: + mov [esi-28],eax ; + ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n] + mov edx,[esi-24] ; + mf_call inv_mix_col ; for n = 1 .. 
5 where the key expansion + mov [esi-24],eax ; cycle is now 6 words long + mov edx,[esi-20] + mf_call inv_mix_col + mov [esi-20],eax + mov edx,[esi-16] + mf_call inv_mix_col + mov [esi-16],eax + mov edx,[esi-12] + mf_call inv_mix_col + mov [esi-12],eax + mov edx,[esi-8] + mf_call inv_mix_col + mov [esi-8],eax + mov edx,[esi-4] + mf_call inv_mix_col + mov [esi-4],eax + +.0: mov edx,[esi] ; the expanded key is 13 * 4 = 44 32-bit words + mf_call inv_mix_col ; of which 11 * 4 = 44 have to be modified + mov [esi],eax ; using inv_mix_col. We have already done 8 + xor eax,[esi-20] ; of these so 36 are left - hence we need + mov [esi+4],eax ; exactly 6 loops of six here + xor eax,[esi-16] + mov [esi+8],eax + xor eax,[esi-12] + mov [esi+12],eax + xor eax,[esi-8] + mov [esi+16],eax + xor eax,[esi-4] + mov [esi+20],eax + add esi,24 + cmp edi,esi + jg .0 + jmp dec_end + +%endif + +%ifdef AES_256 + +%ifndef DECRYPTION_TABLE +; %define DECRYPTION_TABLE +%endif + + do_name _aes_decrypt_key256,8 + + mov ax, sp + movzx esp, ax + push ebp + push ebx + push esi + push edi + + movzx eax, word [esp+20] ; ks + movzx edx, word [esp+18] ; key + push ax + push dx + do_call _aes_encrypt_key256,4 ; generate expanded encryption key + mov eax,14*16 + movzx esi, word [esp+20] ; ks + lea edi,[esi+eax] + add esi,64 + + mov edx,[esi-48] ; the primary key is 8 words, of which + mf_call inv_mix_col ; the top four require modification + mov [esi-48],eax + mov edx,[esi-44] + mf_call inv_mix_col + mov [esi-44],eax + mov edx,[esi-40] + mf_call inv_mix_col + mov [esi-40],eax + mov edx,[esi-36] + mf_call inv_mix_col + mov [esi-36],eax + + mov edx,[esi-32] ; the encryption key expansion cycle is + mf_call inv_mix_col ; now eight words long so we need to + mov [esi-32],eax ; start by doing one complete block + mov edx,[esi-28] + mf_call inv_mix_col + mov [esi-28],eax + mov edx,[esi-24] + mf_call inv_mix_col + mov [esi-24],eax + mov edx,[esi-20] + mf_call inv_mix_col + mov [esi-20],eax + mov edx,[esi-16] + 
mf_call inv_mix_col + mov [esi-16],eax + mov edx,[esi-12] + mf_call inv_mix_col + mov [esi-12],eax + mov edx,[esi-8] + mf_call inv_mix_col + mov [esi-8],eax + mov edx,[esi-4] + mf_call inv_mix_col + mov [esi-4],eax + +.0: mov edx,[esi] ; we can now speed up the remaining + mf_call inv_mix_col ; rounds by using the technique + mov [esi],eax ; outlined earlier. But note that + xor eax,[esi-28] ; there is one extra inverse mix + mov [esi+4],eax ; column operation as the 256 bit + xor eax,[esi-24] ; key has an extra non-linear step + mov [esi+8],eax ; for the midway element. + xor eax,[esi-20] + mov [esi+12],eax ; the expanded key is 15 * 4 = 60 + mov edx,[esi+16] ; 32-bit words of which 52 need to + mf_call inv_mix_col ; be modified. We have already done + mov [esi+16],eax ; 12 so 40 are left - which means + xor eax,[esi-12] ; that we need exactly 5 loops of 8 + mov [esi+20],eax + xor eax,[esi-8] + mov [esi+24],eax + xor eax,[esi-4] + mov [esi+28],eax + add esi,32 + cmp edi,esi + jg .0 + +%endif + +dec_end: + +%ifdef AES_REV_DKS + + movzx esi,word [esp+20] ; this reverses the order of the +.1: mov eax,[esi] ; round keys if required + mov ebx,[esi+4] + mov ebp,[edi] + mov edx,[edi+4] + mov [esi],ebp + mov [esi+4],edx + mov [edi],eax + mov [edi+4],ebx + + mov eax,[esi+8] + mov ebx,[esi+12] + mov ebp,[edi+8] + mov edx,[edi+12] + mov [esi+8],ebp + mov [esi+12],edx + mov [edi+8],eax + mov [edi+12],ebx + + add esi,16 + sub edi,16 + cmp edi,esi + jg .1 + +%endif + + pop edi + pop esi + pop ebx + pop ebp + xor eax,eax + do_exit 8 + +%ifdef AES_VAR + + do_name _aes_decrypt_key,12 + + mov ecx,[esp+4] + mov eax,[esp+8] + mov edx,[esp+12] + push edx + push ecx + + cmp eax,16 + je .1 + cmp eax,128 + je .1 + + cmp eax,24 + je .2 + cmp eax,192 + je .2 + + cmp eax,32 + je .3 + cmp eax,256 + je .3 + mov eax,-1 + add esp,8 + do_exit 12 + +.1: do_call _aes_decrypt_key128,8 + do_exit 12 +.2: do_call _aes_decrypt_key192,8 + do_exit 12 +.3: do_call _aes_decrypt_key256,8 + do_exit 12 + 
+%endif + +%endif + +%ifdef DECRYPTION_TABLE + +; Inverse S-box data - 256 entries + + section _DATA + +%define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x + +_aes_dec_tab: + db v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38) + db v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb) + db v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87) + db v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb) + db v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d) + db v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e) + db v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2) + db v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25) + db v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16) + db v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92) + db v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda) + db v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84) + db v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a) + db v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06) + db v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02) + db v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b) + db v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea) + db v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73) + db v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85) + db v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e) + db v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89) + db v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b) + db v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20) + db 
v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4) + db v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31) + db v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f) + db v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d) + db v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef) + db v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0) + db v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61) + db v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26) + db v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d) + +%endif |