; ---------------------------------------------------------------------------
; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
;
; LICENSE TERMS
;
; The free distribution and use of this software is allowed (with or without
; changes) provided that:
;
;  1. source code distributions include the above copyright notice, this
;     list of conditions and the following disclaimer;
;
;  2. binary distributions include the above copyright notice, this list
;     of conditions and the following disclaimer in their documentation;
;
;  3. the name of the copyright holder is not used to endorse products
;     built using this software without specific written permission.
;
; DISCLAIMER
;
; This software is provided 'as is' with no explicit or implied warranties
; in respect of its properties, including, but not limited to, correctness
; and/or fitness for purpose.
; ---------------------------------------------------------------------------
; Issue 20/12/2007
;
; This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h
; and the same define to be set here as well. If ASM_X86_V2C is set this file
; requires the C files aeskey.c and aestab.c for support.

; An AES implementation for x86 processors using the YASM (or NASM) assembler.
; This is a full assembler implementation covering encryption, decryption and
; key scheduling. It uses 2k bytes of tables but its encryption and decryption
; performance is very close to that obtained using large tables.  Key schedule
; expansion is slower for both encryption and decryption but this is likely to
; be offset by the much smaller load that this version places on the processor
; cache. I acknowledge the contribution made by Daniel Bernstein to aspects of
; the design of the AES round function used here.
;
; This code provides the standard AES block size (128 bits, 16 bytes) and the
; three standard AES key sizes (128, 192 and 256 bits). It has the same call
; interface as my C implementation. The ebx, esi, edi and ebp registers are
; preserved across calls but eax, ecx and edx and the arithmetic status flags
; are not.  Although this is a full assembler implementation, it can be used
; in conjunction with my C code which provides faster key scheduling using
; large tables. In this case aeskey.c should be compiled with ASM_X86_V2C
; defined.  It is also important that the defines below match those used in the
; C code.  This code uses the VC++ register saving conventions; if it is used
; with another compiler, conventions for using and saving registers may need
; to be checked (and calling conventions).  The YASM command line for the VC++
; custom build step is:
;
;    yasm -Xvc -f win32 -D <Z> -o "$(TargetDir)\$(InputName).obj" "$(InputPath)"
;
; For the cryptlib build this is (pcg):
;
;	yasm -Xvc -f win32 -D ASM_X86_V2C -o aescrypt2.obj aes_x86_v2.asm
;
; where <Z> is ASM_X86_V2 or ASM_X86_V2C.  The calling interfaces are:
;
;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
;                                            const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
;                                            const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_decrypt_ctx cx[1]);
;
; where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
; either bits or bytes.

; The DLL interface must use the _stdcall convention in which the number
; of bytes of parameter space is added after an @ to the subroutine's name.
; We must also remove our parameters from the stack before return (see
; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.

;
; Adapted for TrueCrypt:
; - All tables generated at run-time
; - Adapted for 16-bit environment
;

; Assembler set-up: limit the instruction set to the 386 and generate 16-bit
; code.  The 32-bit registers and 32-bit addressing forms used throughout
; this file are still available in this mode.
CPU 386
USE16
; Declare the 16-bit code and data segments and place both in one group.
; NOTE(review): _DATA is the segment selected last here, so any code emitted
; before the first 'section _TEXT' further down presumably lands in _DATA -
; confirm that this is intended.
SEGMENT _TEXT PUBLIC CLASS=CODE USE16
SEGMENT _DATA PUBLIC CLASS=DATA USE16

GROUP DGROUP _TEXT _DATA

; Lookup tables supplied by another module (the built-in encryption table in
; this file is only assembled when ENCRYPTION_TABLE is defined).
extern _aes_dec_tab		; Aestab.c
extern _aes_enc_tab

; %define DLL_EXPORT

; The size of the code can be reduced by using functions for the encryption
; and decryption rounds in place of macro expansion.  When REDUCE_CODE_SIZE
; is defined the mf_call macro emits a 'call' to a shared subroutine; when it
; is not defined mf_call expands the corresponding macro in line.

%define REDUCE_CODE_SIZE

; Comment in/out the following lines to obtain the desired subroutines. These
; selections MUST match those in the C header file aes.h

; %define AES_128                 ; define if AES with 128 bit keys is needed
; %define AES_192                 ; define if AES with 192 bit keys is needed
%define AES_256                 ; define if AES with 256 bit keys is needed
; %define AES_VAR                 ; define if a variable key size is needed
%define ENCRYPTION              ; define if encryption is needed
%define DECRYPTION              ; define if decryption is needed
; %define AES_REV_DKS             ; define if key decryption schedule is reversed

; The assembler key schedule routines are only built when ASM_X86_V2C is NOT
; defined (ASM_X86_V2C is supplied with -D on the assembler command line, see
; the build notes at the top of this file).

%ifndef ASM_X86_V2C
%define ENCRYPTION_KEY_SCHEDULE ; define if encryption key expansion is needed
%define DECRYPTION_KEY_SCHEDULE ; define if decryption key expansion is needed
%endif

; The encryption key schedule has the following in memory layout where N is the
; number of rounds (10, 12 or 14):
;
; lo: | input key (round 0)  |  ; each round is four 32-bit words
;     | encryption round 1   |
;     | encryption round 2   |
;     ....
;     | encryption round N-1 |
; hi: | encryption round N   |
;
; The decryption key schedule is normally set up so that it has the same
; layout as above by actually reversing the order of the encryption key
; schedule in memory (this happens when AES_REV_DKS is set):
;
; lo: | decryption round 0   | =              | encryption round N   |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
;     ....                       ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
; hi: | decryption round N   | =              | input key (round 0)  |
;
; with rounds except the first and last modified using inv_mix_column()
; But if AES_REV_DKS is NOT set the order of keys is left as it is for
; encryption so that it has to be accessed in reverse when used for
; decryption (although the inverse mix column modifications are done)
;
; lo: | decryption round 0   | =              | input key (round 0)  |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
;     ....                       ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
; hi: | decryption round N   | =              | encryption round N   |
;
; This layout is faster when the assembler key scheduling provided here
; is used.
;
; End of user defines

; A variable key length build (AES_VAR) dispatches to all three fixed length
; key schedules, so it switches on every key size not already selected.

%ifdef AES_VAR
  %ifndef AES_128
    %define AES_128
  %endif
  %ifndef AES_192
    %define AES_192
  %endif
  %ifndef AES_256
    %define AES_256
  %endif
%endif

; KS_LENGTH is the number of 32-bit words in the largest key schedule that
; is enabled: 4 * (rounds + 1) = 60, 52 or 44 words for 256, 192 or 128 bit
; keys.  AES_VAR always implies AES_256 (see above) and therefore 60.

%ifdef AES_256
  %define KS_LENGTH 60
%elifdef AES_192
  %define KS_LENGTH 52
%else
  %define KS_LENGTH 44
%endif

; Stack based local variables.  Locals live in 4-byte slots addressed from
; ESP; slot n is the dword at [esp + 4*n].
;
;   save    n, reg      store reg in slot n
;   restore reg, n      reload reg from slot n

%macro  save 2
    mov     [esp + 4*(%1)], %2
%endmacro

%macro  restore 2
    mov     %1, [esp + 4*(%2)]
%endmacro

; mf_call name - invoke a round or helper routine by name.  In a
; REDUCE_CODE_SIZE build 'name' is a label and is called as a subroutine;
; otherwise 'name' is a macro of the same name and is expanded in place.

%macro mf_call 1
%ifdef REDUCE_CODE_SIZE
    call    %1
%else
    %1
%endif
%endmacro

; The DLL build (DLL_EXPORT) has to implement the _stdcall calling interface:
; the exported name is decorated with '@' and the number of parameter bytes,
; and the callee removes its parameters from the stack on return.
;
; 'parms' is the default parameter byte count used by the macros below when
; no explicit count is given.
; NOTE(review): 12 corresponds to three 4-byte parameters of the 32-bit
; build; the 16-bit routines in this file read 2-byte parameters.  The count
; only takes effect in do_name/do_exit when DLL_EXPORT is defined - confirm
; if DLL_EXPORT is ever enabled for this port.

%define parms 12

; do_name name [, bytes] - declare a global entry point label.  Without
; DLL_EXPORT the label is 'name'; with DLL_EXPORT it is 'name@bytes' and is
; also exported.

%macro  do_name 1-2 parms
%ifndef DLL_EXPORT
    global  %1
%1:
%else
    global  %1@%2
    export  %1@%2
%1@%2:
%endif
%endmacro

; do_call name [, bytes] - call an entry point declared with do_name.
; In a DLL_EXPORT build the decorated name is called and the callee pops its
; own parameters; otherwise the caller releases 'bytes' bytes of parameters
; after the call returns.

%macro  do_call 1-2 parms
%ifdef DLL_EXPORT
    call    %1@%2
%else
    call    %1
    add     esp,%2
%endif
%endmacro

; do_exit [bytes] - return from an entry point.  A plain 'ret' is used
; unless DLL_EXPORT is defined, in which case 'bytes' bytes of parameters
; are removed from the stack as part of the return.

%macro  do_exit  0-1 parms
%ifndef DLL_EXPORT
    ret
%else
    ret %1
%endif
%endmacro

; finite field multiplies by {02}, {04} and {08}
;
; These are preprocessor expressions on byte values.  Each one shifts the
; operand left by 1, 2 or 3 bits and XORs in a multiple of 0x11b for every
; operand bit that the shift moves beyond bit 7, which brings the result
; back into eight bits.  f2() is also used by f_key to step the round
; constant rc_val at assembly time.

%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

; finite field multiplies required in table generation
; (multiplication by {03}, {09}, {0b}, {0d} and {0e}, each formed as the XOR
; of the power-of-two multiples above)

%define f3(x)   (f2(x) ^ x)
%define f9(x)   (f8(x) ^ x)
%define fb(x)   (f8(x) ^ f2(x) ^ x)
%define fd(x)   (f8(x) ^ f4(x) ^ x)
%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))

; Encryption table accessors.  The table holds one 8-byte entry per input
; byte; each macro forms a memory operand at a fixed byte offset inside entry
; x, where x is a register holding the zero-extended byte value.
;
; With the entry layout produced by the u8() macro of the optional built-in
; table, (0, s, s, f3(s), f2(s), s, s, f3(s)) for S-box value s, a dword read
; through etab_0..etab_3 yields the same four bytes rotated by 0..3 byte
; positions, and etab_b is the plain S-box byte.
; NOTE(review): the table normally comes from Aestab.c (extern); it is
; presumably built with the same layout - confirm.

%define etab_0(x)   [_aes_enc_tab+4+8*x]
%define etab_1(x)   [_aes_enc_tab+3+8*x]
%define etab_2(x)   [_aes_enc_tab+2+8*x]
%define etab_3(x)   [_aes_enc_tab+1+8*x]
%define etab_b(x)   byte [_aes_enc_tab+1+8*x] ; used with movzx for 0x000000xx
%define etab_w(x)   word [_aes_enc_tab+8*x]   ; used with movzx for 0x0000xx00

; Alternative accessors used only by the disabled ('%else') variants of
; lr_xor and lr_mov, which mask a single byte out of the dword that is read.

%define btab_0(x)   [_aes_enc_tab+6+8*x]
%define btab_1(x)   [_aes_enc_tab+5+8*x]
%define btab_2(x)   [_aes_enc_tab+4+8*x]
%define btab_3(x)   [_aes_enc_tab+3+8*x]

; ROUND FUNCTION.  Build column[2] on ESI and column[3] on EDI that have the
; round keys pre-loaded. Build column[0] in EBP and column[1] in EBX.
;
; Input:
;
;   EAX     column[0]
;   EBX     column[1]
;   ECX     column[2]
;   EDX     column[3]
;   ESI     column key[round][2]
;   EDI     column key[round][3]
;   EBP     scratch
;
; Output:
;
;   EBP     column[0]   unkeyed
;   EBX     column[1]   unkeyed
;   ESI     column[2]   keyed
;   EDI     column[3]   keyed
;   EAX     scratch
;   ECX     scratch
;   EDX     scratch

; rnd_fun xor_op, mov_op
;
;   %1  macro that XORs a table lookup into a destination register
;       (nr_xor for normal rounds, lr_xor for the last round)
;   %2  macro that loads a table lookup into a destination register
;       (nr_mov for normal rounds, lr_mov for the last round)
;
; Every lookup line has the form:  destination, source byte register,
; table rotation / byte position (0..3), scratch register used as the index.
; The order of the lines matters: registers are re-used as soon as the bytes
; they held have been consumed.

%macro rnd_fun 2

    rol     ebx,16              ; swap halves: bytes 2,3 of column[1] -> bl,bh
    %1      esi, cl, 0, ebp
    %1      esi, dh, 1, ebp
    %1      esi, bh, 3, ebp
    %1      edi, dl, 0, ebp
    %1      edi, ah, 1, ebp
    %1      edi, bl, 2, ebp
    %2      ebp, al, 0, ebp     ; start building column[0] in ebp
    shr     ebx,16              ; ebx = bytes 0,1 of the original column[1]
    and     eax,0xffff0000      ; keep bytes 2,3 of column[0] and merge in
    or      eax,ebx             ; bytes 0,1 of column[1] so that ebx is free
    shr     edx,16              ; bytes 2,3 of column[3] -> dl,dh
    %1      ebp, ah, 1, ebx
    %1      ebp, dh, 3, ebx
    %2      ebx, dl, 2, ebx     ; start building column[1] in ebx
    %1      ebx, ch, 1, edx
    %1      ebx, al, 0, edx
    shr     eax,16              ; bytes 2,3 of column[0] -> al,ah
    shr     ecx,16              ; bytes 2,3 of column[2] -> cl,ch
    %1      ebp, cl, 2, edx
    %1      edi, ch, 3, edx
    %1      esi, al, 2, edx
    %1      ebx, ah, 3, edx

%endmacro

; Basic MOV and XOR Operations for normal rounds
;
;   nr_xor dst, byte_reg, n, idx    dst ^= dword etab_n[byte_reg]
;   nr_mov dst, byte_reg, n, idx    dst  = dword etab_n[byte_reg]
;
; idx is a 32-bit scratch register that receives the zero-extended byte and
; is used as the table index.  In nr_mov idx may be the same register as dst.

%macro  nr_xor  4
    movzx   %4,%2
    xor     %1,etab_%3(%4)
%endmacro

%macro  nr_mov  4
    movzx   %4,%2
    mov     %1,etab_%3(%4)
%endmacro

; Basic MOV and XOR Operations for last round
;
;   lr_xor dst, byte_reg, n, idx    dst ^= (table byte for byte_reg) << 8*n
;   lr_mov dst, byte_reg, n, idx    dst  = (table byte for byte_reg) << 8*n
;
; The last round uses only the single byte read through etab_b, placed in
; byte position n of the destination.  idx is a 32-bit scratch register; in
; lr_xor it also holds the shifted byte before it is XORed into dst.

%if 1

    %macro  lr_xor  4
        movzx   %4,%2
        movzx   %4,etab_b(%4)
    %if %3 != 0
        shl     %4,8*%3
    %endif
        xor     %1,%4
    %endmacro

    %macro  lr_mov  4
        movzx   %4,%2
        movzx   %1,etab_b(%4)
    %if %3 != 0
        shl     %1,8*%3
    %endif
    %endmacro

%else       ; less effective but worth leaving as an option

    ; This variant reads a whole dword through btab_n and masks out the one
    ; byte wanted instead of loading a byte and shifting it.

    %macro  lr_xor  4
        movzx   %4,%2
        mov     %4,btab_%3(%4)
        and     %4,0x000000ff << 8 * %3
        xor     %1,%4
    %endmacro

    %macro  lr_mov  4
        movzx   %4,%2
        mov     %1,btab_%3(%4)
        and     %1,0x000000ff << 8 * %3
    %endmacro

%endif

; l3s_col - pass each of the four bytes of EAX through the table byte read
; by etab_b, keep every result in its original byte position, and XOR the
; resulting word into EDX.  The sum is returned in EAX.  No byte rotation is
; done here; callers that need one rotate EAX before invoking l3s_col.
;
;   in      eax = word to substitute
;           edx = word the substituted result is XORed into
;   out     eax = edx ^ substituted(eax)
;   scratch ecx, edx, flags
;
; The instruction sequence lives in one helper macro which is emitted either
; as a callable subroutine (REDUCE_CODE_SIZE) or through an in-line macro of
; the same name, so that 'mf_call l3s_col' works in both builds.

%macro l3s_col_body 0

    movzx   ecx,al              ; byte 0
    movzx   ecx, etab_b(ecx)
    xor     edx,ecx
    movzx   ecx,ah              ; byte 1
    movzx   ecx, etab_b(ecx)
    shl     ecx,8
    xor     edx,ecx
    shr     eax,16              ; bring bytes 2,3 down into al,ah
    movzx   ecx,al              ; byte 2
    movzx   ecx, etab_b(ecx)
    shl     ecx,16
    xor     edx,ecx
    movzx   ecx,ah              ; byte 3
    movzx   ecx, etab_b(ecx)
    shl     ecx,24
    xor     edx,ecx
    mov     eax,edx             ; hand the result back in eax

%endmacro

%ifdef REDUCE_CODE_SIZE

l3s_col:
    l3s_col_body
    ret

%else

%macro l3s_col 0
    l3s_col_body
%endmacro

%endif

; offsets to parameters
;
; These are byte offsets from the stack pointer at entry to _aes_encrypt and
; _aes_decrypt: a 2-byte return address followed by three parameters that are
; read as 16-bit words.  Inside the routines stk_spc is added because ESP has
; been lowered by that amount.

in_blk  equ     2   ; input byte array address parameter
out_blk equ     4   ; output byte array address parameter
ctx     equ     6   ; AES context structure
stk_spc equ    20   ; stack space: slots at +4..+16 hold the saved edi, esi,
                    ; ebx and ebp; slot 0 holds the round key pointer

%ifdef  ENCRYPTION

; %define ENCRYPTION_TABLE

%ifdef REDUCE_CODE_SIZE

; enc_round - one normal encryption round (subroutine form, reached through
; 'mf_call enc_round').
;
;   in      eax, ebx, ecx, edx = state columns 0..3
;           ebp = address of the round key applied by the previous step
;   out     eax, ebx, ecx, edx = new state columns 0..3, round key applied
;           ebp = address of this round's key (advanced by 16)
;   scratch esi, edi, flags
;
; The near call pushed a 2-byte return address, so after 'sub sp,2' slot 1
; here ([esp+4]) is the same stack location as slot 0 ([esp]) in the caller.

enc_round:
	sub		sp, 2
    add     ebp,16              ; step to this round's key
    save    1,ebp               ; ebp is needed as a work register below
    mov     esi,[ebp+8]         ; pre-load key words 2 and 3; rnd_fun XORs
    mov     edi,[ebp+12]        ; columns 2 and 3 straight into them

    rnd_fun nr_xor, nr_mov

    mov     eax,ebp             ; column 0 (not yet keyed)
    mov     ecx,esi             ; column 2 (keyed)
    mov     edx,edi             ; column 3 (keyed)
    restore ebp,1               ; recover the round key pointer
    xor     eax,[ebp]           ; add key words 0 and 1 to columns 0 and 1
    xor     ebx,[ebp+4]
	add		sp, 2
    ret

%else

; enc_round - the same round as an in-line macro.  There is no return
; address on the stack in this form, so the key pointer is kept in slot 0.

%macro enc_round 0

    add     ebp,16
    save    0,ebp
    mov     esi,[ebp+8]
    mov     edi,[ebp+12]

    rnd_fun nr_xor, nr_mov

    mov     eax,ebp
    mov     ecx,esi
    mov     edx,edi
    restore ebp,0
    xor     eax,[ebp]
    xor     ebx,[ebp+4]

%endmacro

%endif

; enc_last_round - final encryption round, always expanded in line.  It uses
; the single-byte last round lookups (lr_xor / lr_mov) instead of the dword
; lookups of the normal rounds.
;
;   in      eax, ebx, ecx, edx = state columns 0..3
;           ebp = address of the round key applied by the previous step
;   out     eax, ebx, esi, edi = output columns 0..3, round key applied
;           ebp = address of the last round key
;   scratch ecx, edx, flags

%macro enc_last_round 0

    add     ebp,16              ; step to the last round key
    save    0,ebp               ; ebp is needed as a work register below
    mov     esi,[ebp+8]         ; pre-load key words 2 and 3
    mov     edi,[ebp+12]

    rnd_fun lr_xor, lr_mov

    mov     eax,ebp             ; column 0 (not yet keyed)
    restore ebp,0               ; recover the round key pointer
    xor     eax,[ebp]           ; add key words 0 and 1 to columns 0 and 1
    xor     ebx,[ebp+4]

%endmacro

    section _TEXT

; AES Encryption Subroutine
;
; Encrypts one 16-byte block.  The three parameters (input block, output
; block, context) are read from the stack as 16-bit words at the offsets
; in_blk, out_blk and ctx.
;
;   returns eax = 0 on success; when AES_256 is not defined, eax = -1 if the
;           length byte stored in the context is not 10*16, 12*16 or 14*16
;   saves   ebp, ebx, esi, edi (restored before return)
;   scratch eax, ecx, edx, flags

    do_name _aes_encrypt,12

	mov		ax, sp              ; clear the upper half of ESP so that the
	movzx	esp, ax             ; [esp+n] addressing used below is valid

    sub     esp,stk_spc
    mov     [esp+16],ebp
    mov     [esp+12],ebx
    mov     [esp+ 8],esi
    mov     [esp+ 4],edi

    movzx   esi,word [esp+in_blk+stk_spc] ; input pointer
    mov     eax,[esi   ]
    mov     ebx,[esi+ 4]
    mov     ecx,[esi+ 8]
    mov     edx,[esi+12]

    movzx   ebp,word [esp+ctx+stk_spc]    ; key pointer
    movzx   edi,byte [ebp+4*KS_LENGTH]    ; length byte stored after the schedule
    xor     eax,[ebp   ]                  ; add the round 0 key
    xor     ebx,[ebp+ 4]
    xor     ecx,[ebp+ 8]
    xor     edx,[ebp+12]

; determine the number of rounds
; (skipped when AES_256 is defined: all 14 rounds below are then always run)

%ifndef AES_256
    cmp     edi,10*16
    je      .3
    cmp     edi,12*16
    je      .2
    cmp     edi,14*16
    je      .1
    mov     eax,-1
    jmp     .5
%endif

; .1 is the entry for 14 rounds, .2 for 12 rounds and .3 for 10 rounds; the
; last round is always enc_last_round.

.1: mf_call enc_round
    mf_call enc_round
.2: mf_call enc_round
    mf_call enc_round
.3: mf_call enc_round
    mf_call enc_round
    mf_call enc_round
    mf_call enc_round
    mf_call enc_round
    mf_call enc_round
    mf_call enc_round
    mf_call enc_round
    mf_call enc_round
    enc_last_round

    movzx   edx,word [esp+out_blk+stk_spc] ; output pointer
    mov     [edx],eax
    mov     [edx+4],ebx
    mov     [edx+8],esi
    mov     [edx+12],edi
    xor     eax,eax                        ; return 0

.5: mov     ebp,[esp+16]
    mov     ebx,[esp+12]
    mov     esi,[esp+ 8]
    mov     edi,[esp+ 4]
    add     esp,stk_spc
    do_exit 12

%endif

; f_key n, size - generate one cycle of the encryption key schedule.
;
;   %1  cycle index n (0, 1, 2, ...)
;   %2  key size in bytes (16, 24 or 32); the words of cycle n are stored at
;       [ebp + n*size] onwards
;
;   in      esi, edi, ecx, edx = first four words of the previous cycle
;           eax = last word of the previous cycle
;           ebp = address of the words of cycle 0
;   out     esi, edi, ecx, edx = first four words of this cycle
;           eax = last word written by this cycle
;
; rc_val is an assembly-time round constant; every expansion of f_key uses
; the current value and then replaces it with f2(rc_val).

%macro f_key 2

    push    ecx
    push    edx
    mov     edx,esi             ; l3s_col XORs its result into edx
    ror     eax,8               ; rotate the last word by one byte
    mf_call l3s_col             ; eax = esi ^ substituted(rotated word)
    mov     esi,eax
    pop     edx
    pop     ecx
    xor     esi,rc_val          ; add the round constant

    mov     [ebp+%1*%2],esi     ; each further word is the previous new word
    xor     edi,esi             ; XORed with the word one cycle back
    mov     [ebp+%1*%2+4],edi
    xor     ecx,edi
    mov     [ebp+%1*%2+8],ecx
    xor     edx,ecx
    mov     [ebp+%1*%2+12],edx
    mov     eax,edx

%if %2 == 24

; 192-bit keys: two more words per cycle, except in the last cycle (n = 7)
; where only four words are generated

%if %1 < 7
    xor     eax,[ebp+%1*%2+16-%2]
    mov     [ebp+%1*%2+16],eax
    xor     eax,[ebp+%1*%2+20-%2]
    mov     [ebp+%1*%2+20],eax
%endif

%elif %2 == 32

; 256-bit keys: four more words per cycle, except in the last cycle (n = 6).
; The first of them applies the substitution again, without a rotation and
; without a round constant.

%if %1 < 6
    push    ecx
    push    edx
    mov     edx,[ebp+%1*%2+16-%2]   ; word 4 of the previous cycle
    mf_call l3s_col                 ; eax = that word ^ substituted(eax)
    pop     edx
    pop     ecx
    mov     [ebp+%1*%2+16],eax
    xor     eax,[ebp+%1*%2+20-%2]
    mov     [ebp+%1*%2+20],eax
    xor     eax,[ebp+%1*%2+24-%2]
    mov     [ebp+%1*%2+24],eax
    xor     eax,[ebp+%1*%2+28-%2]
    mov     [ebp+%1*%2+28],eax
%endif

%endif

%assign rc_val f2(rc_val)

%endmacro

%ifdef ENCRYPTION_KEY_SCHEDULE

%ifdef  AES_128

%ifndef ENCRYPTION_TABLE
; %define ENCRYPTION_TABLE
%endif

%assign rc_val  1

; AES 128-bit encryption key schedule.
;
; Uses the same 16-bit stack interface as _aes_encrypt_key256: after the
; 2-byte return address come two parameters read as 16-bit words, the key
; address followed by the context address.  (The 32-bit offsets that this
; routine previously used do not match the stack layout of 16-bit code.)
;
; Copies the 16 key bytes to the start of the context, expands them to
; 11 * 4 = 44 words and stores 10*16 in the dword that follows the key
; schedule area (offset 4*KS_LENGTH).
;
;   returns eax = 0
;   saves   ebp, ebx, esi, edi (restored before return)
;   scratch eax, ecx, edx, flags

    do_name _aes_encrypt_key128,8

	mov		ax, sp              ; clear the upper half of ESP so that the
	movzx	esp, ax             ; [esp+n] addressing used below is valid

    push    ebp
    push    ebx
    push    esi
    push    edi

    ; 16 bytes of saved registers + 2-byte return address precede the parameters
    movzx   ebp, word [esp+20] ; ks
    mov     [ebp+4*KS_LENGTH],dword 10*16
    movzx   ebx, word [esp+18] ; key

    mov     esi,[ebx]
    mov     [ebp],esi
    mov     edi,[ebx+4]
    mov     [ebp+4],edi
    mov     ecx,[ebx+8]
    mov     [ebp+8],ecx
    mov     edx,[ebx+12]
    mov     [ebp+12],edx
    add     ebp,16              ; f_key stores relative to the first new word
    mov     eax,edx             ; f_key expects the last word in eax

    f_key   0,16        ; 11 * 4 = 44 unsigned longs
    f_key   1,16        ; 4 + 4 * 10 generated = 44
    f_key   2,16
    f_key   3,16
    f_key   4,16
    f_key   5,16
    f_key   6,16
    f_key   7,16
    f_key   8,16
    f_key   9,16

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    xor     eax,eax
    do_exit  8

%endif

%ifdef  AES_192

%ifndef ENCRYPTION_TABLE
; %define ENCRYPTION_TABLE
%endif

%assign rc_val  1

; AES 192-bit encryption key schedule.
;
; Uses the same 16-bit stack interface as _aes_encrypt_key256: after the
; 2-byte return address come two parameters read as 16-bit words, the key
; address followed by the context address.  (The 32-bit offsets that this
; routine previously used do not match the stack layout of 16-bit code.)
;
; Copies the 24 key bytes to the start of the context, expands them to
; 13 * 4 = 52 words and stores 12*16 in the dword that follows the key
; schedule area (offset 4*KS_LENGTH).
;
;   returns eax = 0
;   saves   ebp, ebx, esi, edi (restored before return)
;   scratch eax, ecx, edx, flags

    do_name _aes_encrypt_key192,8

	mov		ax, sp              ; clear the upper half of ESP so that the
	movzx	esp, ax             ; [esp+n] addressing used below is valid

    push    ebp
    push    ebx
    push    esi
    push    edi

    ; 16 bytes of saved registers + 2-byte return address precede the parameters
    movzx   ebp, word [esp+20] ; ks
    mov     [ebp+4*KS_LENGTH],dword 12 * 16
    movzx   ebx, word [esp+18] ; key

    mov     esi,[ebx]
    mov     [ebp],esi
    mov     edi,[ebx+4]
    mov     [ebp+4],edi
    mov     ecx,[ebx+8]
    mov     [ebp+8],ecx
    mov     edx,[ebx+12]
    mov     [ebp+12],edx
    mov     eax,[ebx+16]
    mov     [ebp+16],eax
    mov     eax,[ebx+20]        ; eax is left holding the last key word,
    mov     [ebp+20],eax        ; as f_key expects
    add     ebp,24              ; f_key stores relative to the first new word

    f_key   0,24        ; 13 * 4 = 52 unsigned longs
    f_key   1,24        ; 6 + 6 * 8 generated = 54
    f_key   2,24
    f_key   3,24
    f_key   4,24
    f_key   5,24
    f_key   6,24
    f_key   7,24

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    xor     eax,eax
    do_exit  8

%endif

%ifdef  AES_256

%ifndef ENCRYPTION_TABLE
; %define ENCRYPTION_TABLE
%endif

%assign rc_val  1

; AES 256-bit encryption key schedule.
;
; After the 2-byte return address come two parameters read as 16-bit words,
; the key address followed by the context address.  The routine copies the
; 32 key bytes to the start of the context, expands them to 15 * 4 = 60
; words and stores 14*16 in the dword that follows the key schedule area
; (offset 4*KS_LENGTH).
;
;   returns eax = 0
;   saves   ebp, ebx, esi, edi (restored before return)
;   scratch eax, ecx, edx, flags

    do_name _aes_encrypt_key256,8

	mov		ax, sp              ; clear the upper half of ESP so that the
	movzx	esp, ax             ; [esp+n] addressing used below is valid

    push    ebp
    push    ebx
    push    esi
    push    edi

    ; 16 bytes of saved registers + 2-byte return address precede the parameters
    movzx   ebp, word [esp+20] ; ks
    mov     [ebp+4*KS_LENGTH],dword 14 * 16
    movzx   ebx, word [esp+18] ; key

    mov     esi,[ebx]
    mov     [ebp],esi
    mov     edi,[ebx+4]
    mov     [ebp+4],edi
    mov     ecx,[ebx+8]
    mov     [ebp+8],ecx
    mov     edx,[ebx+12]
    mov     [ebp+12],edx
    mov     eax,[ebx+16]
    mov     [ebp+16],eax
    mov     eax,[ebx+20]
    mov     [ebp+20],eax
    mov     eax,[ebx+24]
    mov     [ebp+24],eax
    mov     eax,[ebx+28]        ; eax is left holding the last key word,
    mov     [ebp+28],eax        ; as f_key expects
    add     ebp,32              ; f_key stores relative to the first new word

    f_key   0,32        ; 15 * 4 = 60 unsigned longs
    f_key   1,32        ; 8 + 8 * 7 generated = 64
    f_key   2,32
    f_key   3,32
    f_key   4,32
    f_key   5,32
    f_key   6,32

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    xor     eax,eax
    do_exit  8

%endif

%ifdef  AES_VAR

%ifndef ENCRYPTION_TABLE
; %define ENCRYPTION_TABLE
%endif

; Variable key length encryption key schedule: dispatches on the length
; parameter, which may be given in bytes (16, 24, 32) or bits (128, 192,
; 256), to the matching fixed length routine.  Returns -1 in eax for any
; other length; otherwise eax is whatever the routine called leaves in it.
;
; NOTE(review): this dispatcher reads its three parameters as 32-bit values
; at [esp+4], [esp+8] and [esp+12] and passes on 8 bytes of parameters,
; whereas _aes_encrypt_key256 in this file reads 16-bit words at [esp+18]
; and [esp+20] after its own pushes.  It appears not to have been adapted to
; the 16-bit build - confirm before enabling AES_VAR.

    do_name _aes_encrypt_key,12

    mov     ecx,[esp+4]         ; key
    mov     eax,[esp+8]         ; length
    mov     edx,[esp+12]        ; context
    push    edx                 ; set up the parameters for the routine that
    push    ecx                 ; is called below

    cmp     eax,16
    je      .1
    cmp     eax,128
    je      .1

    cmp     eax,24
    je      .2
    cmp     eax,192
    je      .2

    cmp     eax,32
    je      .3
    cmp     eax,256
    je      .3
    mov     eax,-1              ; unsupported key length
    add     esp,8               ; drop the two parameters pushed above
    do_exit 12

.1: do_call _aes_encrypt_key128,8
    do_exit 12
.2: do_call _aes_encrypt_key192,8
    do_exit 12
.3: do_call _aes_encrypt_key256,8
    do_exit 12

%endif

%endif

%ifdef ENCRYPTION_TABLE

; S-box data - 256 entries
;
; Optional built-in encryption table (only assembled when ENCRYPTION_TABLE is
; defined).  Each argument of u8() below is one S-box value s, listed in
; input byte order, and expands to the 8-byte entry
;
;     0, s, s, f3(s), f2(s), s, s, f3(s)
;
; which is the layout that the etab_* accessors index into.

    section _DATA

%define u8(x)   0, x, x, f3(x), f2(x), x, x, f3(x)

_aes_enc_tab:
    db  u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5)
    db  u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76)
    db  u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0)
    db  u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0)
    db  u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc)
    db  u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15)
    db  u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a)
    db  u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75)
    db  u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0)
    db  u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84)
    db  u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b)
    db  u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf)
    db  u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85)
    db  u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8)
    db  u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5)
    db  u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2)
    db  u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17)
    db  u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73)
    db  u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88)
    db  u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb)
    db  u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c)
    db  u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79)
    db  u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9)
    db  u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08)
    db  u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6)
    db  u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a)
    db  u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e)
    db  u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e)
    db  u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94)
    db  u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf)
    db  u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68)
    db  u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16)

%endif

%ifdef  DECRYPTION

; %define DECRYPTION_TABLE

; Decryption table accessors.  As for the encryption table there is one
; 8-byte entry per input byte, and each macro forms a memory operand at a
; fixed byte offset inside entry x (x is a register holding the byte value).
; dtab_0..dtab_3 are read as dwords by the normal rounds and by inv_mix_col;
; dtab_x is the single byte used by the last round (li_xor / li_mov).
; NOTE(review): the table contents come from Aestab.c and are not visible
; here; presumably offsets 0, 3, 2 and 1 give byte rotations of one
; inverse-round word and offset 7 holds the inverse S-box byte - confirm.

%define dtab_0(x)   [_aes_dec_tab+  8*x]
%define dtab_1(x)   [_aes_dec_tab+3+8*x]
%define dtab_2(x)   [_aes_dec_tab+2+8*x]
%define dtab_3(x)   [_aes_dec_tab+1+8*x]
%define dtab_x(x)   byte [_aes_dec_tab+7+8*x]

; irn_fun xor_op, mov_op - inverse (decryption) round function.
;
;   %1  macro that XORs a table lookup into a destination register
;       (ni_xor for normal rounds, li_xor for the last round)
;   %2  macro that loads a table lookup into a destination register
;       (ni_mov for normal rounds, li_mov for the last round)
;
;   in      eax, ebx, ecx, edx = state columns 0..3
;           esi, edi = round key words 2 and 3 (pre-loaded by the caller)
;   out     eax = column 0 and ebp = column 1, both without the round key
;           esi = column 2 and edi = column 3, with the round key applied
;   scratch ebx, ecx, edx, flags
;
; Every lookup line has the form:  destination, source byte register,
; table rotation / byte position (0..3), scratch register used as the index.
; The order of the lines matters: registers are re-used as soon as the bytes
; they held have been consumed.

%macro irn_fun 2

    rol eax,16                  ; swap halves: bytes 2,3 of column[0] -> al,ah
    %1      esi, cl, 0, ebp
    %1      esi, bh, 1, ebp
    %1      esi, al, 2, ebp
    %1      edi, dl, 0, ebp
    %1      edi, ch, 1, ebp
    %1      edi, ah, 3, ebp
    %2      ebp, bl, 0, ebp     ; start building column[1] in ebp
    shr     eax,16              ; eax = bytes 0,1 of the original column[0]
    and     ebx,0xffff0000      ; keep bytes 2,3 of column[1] and merge in
    or      ebx,eax             ; bytes 0,1 of column[0] so that eax is free
    shr     ecx,16              ; bytes 2,3 of column[2] -> cl,ch
    %1      ebp, bh, 1, eax
    %1      ebp, ch, 3, eax
    %2      eax, cl, 2, ecx     ; start building column[0] in eax
    %1      eax, bl, 0, ecx
    %1      eax, dh, 1, ecx
    shr     ebx,16              ; bytes 2,3 of column[1] -> bl,bh
    shr     edx,16              ; bytes 2,3 of column[3] -> dl,dh
    %1      esi, dh, 3, ecx
    %1      ebp, dl, 2, ecx
    %1      eax, bh, 3, ecx
    %1      edi, bl, 2, ecx

%endmacro

; Basic MOV and XOR Operations for normal rounds
;
;   ni_xor dst, byte_reg, n, idx    dst ^= dword dtab_n[byte_reg]
;   ni_mov dst, byte_reg, n, idx    dst  = dword dtab_n[byte_reg]
;
; idx is a 32-bit scratch register that receives the zero-extended byte and
; is used as the table index.  In ni_mov idx may be the same register as dst.

%macro  ni_xor  4
    movzx   %4,%2
    xor     %1,dtab_%3(%4)
%endmacro

%macro  ni_mov  4
    movzx   %4,%2
    mov     %1,dtab_%3(%4)
%endmacro

; Basic MOV and XOR Operations for last round
;
;   li_xor dst, byte_reg, n, idx    dst ^= (dtab_x byte for byte_reg) << 8*n
;   li_mov dst, byte_reg, n, idx    dst  = (dtab_x byte for byte_reg) << 8*n
;
; In li_xor idx also holds the shifted byte before it is XORed into dst.

%macro  li_xor  4
    movzx   %4,%2
    movzx   %4,dtab_x(%4)
%if %3 != 0
    shl     %4,8*%3
%endif
    xor     %1,%4
%endmacro

%macro  li_mov  4
    movzx   %4,%2
    movzx   %1,dtab_x(%4)
%if %3 != 0
    shl     %1,8*%3
%endif
%endmacro

%ifdef REDUCE_CODE_SIZE

; dec_round - one normal decryption round (subroutine form, reached through
; 'mf_call dec_round').
;
;   in      eax, ebx, ecx, edx = state columns 0..3
;           ebp = address of the round key applied by the previous step
;   out     eax, ebx, ecx, edx = new state columns 0..3, round key applied
;           ebp = address of this round's key (16 bytes further up when
;                 AES_REV_DKS is defined, 16 bytes further down otherwise)
;   scratch esi, edi, flags
;
; The near call pushed a 2-byte return address, so after 'sub sp,2' slot 1
; here ([esp+4]) is the same stack location as slot 0 ([esp]) in the caller.

dec_round:
	sub		sp, 2
%ifdef AES_REV_DKS
    add     ebp,16
%else
    sub     ebp,16
%endif
    save    1,ebp               ; ebp is needed as a work register below
    mov     esi,[ebp+8]         ; pre-load key words 2 and 3; irn_fun XORs
    mov     edi,[ebp+12]        ; columns 2 and 3 straight into them

    irn_fun ni_xor, ni_mov

    mov     ebx,ebp             ; column 1 (not yet keyed)
    mov     ecx,esi             ; column 2 (keyed)
    mov     edx,edi             ; column 3 (keyed)
    restore ebp,1               ; recover the round key pointer
    xor     eax,[ebp]           ; add key words 0 and 1 to columns 0 and 1
    xor     ebx,[ebp+4]
    add		sp, 2
    ret

%else

; dec_round - the same round as an in-line macro.  There is no return
; address on the stack in this form, so the key pointer is kept in slot 0.

%macro dec_round 0

%ifdef AES_REV_DKS
    add     ebp,16
%else
    sub     ebp,16
%endif
    save    0,ebp
    mov     esi,[ebp+8]
    mov     edi,[ebp+12]

    irn_fun ni_xor, ni_mov

    mov     ebx,ebp
    mov     ecx,esi
    mov     edx,edi
    restore ebp,0
    xor     eax,[ebp]
    xor     ebx,[ebp+4]

%endmacro

%endif

; dec_last_round - final decryption round, always expanded in line.  It uses
; the single-byte last round lookups (li_xor / li_mov) instead of the dword
; lookups of the normal rounds.
;
;   in      eax, ebx, ecx, edx = state columns 0..3
;           ebp = address of the round key applied by the previous step
;   out     eax, ebx, esi, edi = output columns 0..3, round key applied
;           ebp = address of the last round key used
;   scratch ecx, edx, flags

%macro dec_last_round 0

%ifdef AES_REV_DKS
    add     ebp,16
%else
    sub     ebp,16
%endif
    save    0,ebp               ; ebp is needed as a work register below
    mov     esi,[ebp+8]         ; pre-load key words 2 and 3
    mov     edi,[ebp+12]

    irn_fun li_xor, li_mov

    mov     ebx,ebp             ; column 1 (not yet keyed)
    restore ebp,0               ; recover the round key pointer
    xor     eax,[ebp]           ; add key words 0 and 1 to columns 0 and 1
    xor     ebx,[ebp+4]

%endmacro

    section _TEXT

; AES Decryption Subroutine
;
; Decrypts one 16-byte block.  The three parameters (input block, output
; block, context) are read from the stack as 16-bit words at the offsets
; in_blk, out_blk and ctx.
;
;   returns eax = 0 on success; when AES_256 is not defined, eax = -1 if the
;           length byte stored in the context is not 10*16, 12*16 or 14*16
;   saves   ebp, ebx, esi, edi (restored before return)
;   scratch eax, ecx, edx, flags

    do_name _aes_decrypt,12

	mov		ax, sp              ; clear the upper half of ESP so that the
	movzx	esp, ax             ; [esp+n] addressing used below is valid

    sub     esp,stk_spc
    mov     [esp+16],ebp
    mov     [esp+12],ebx
    mov     [esp+ 8],esi
    mov     [esp+ 4],edi

; input four columns and xor in first round key

    movzx   esi,word [esp+in_blk+stk_spc] ; input pointer
    mov     eax,[esi   ]
    mov     ebx,[esi+ 4]
    mov     ecx,[esi+ 8]
    mov     edx,[esi+12]
                                ; esi is not needed after this point: every
                                ; round reloads it from the key schedule

    movzx   ebp, word [esp+ctx+stk_spc]    ; key pointer
    movzx   edi,byte[ebp+4*KS_LENGTH]      ; length byte stored after the schedule
%ifndef  AES_REV_DKS        ; if decryption key schedule is not reversed
    lea     ebp,[ebp+edi] ; we have to access it from the top down
%endif
    xor     eax,[ebp   ]  ; key schedule
    xor     ebx,[ebp+ 4]
    xor     ecx,[ebp+ 8]
    xor     edx,[ebp+12]

; determine the number of rounds
; (skipped when AES_256 is defined: all 14 rounds below are then always run)

%ifndef AES_256
    cmp     edi,10*16
    je      .3
    cmp     edi,12*16
    je      .2
    cmp     edi,14*16
    je      .1
    mov     eax,-1
    jmp     .5
%endif

; .1 is the entry for 14 rounds, .2 for 12 rounds and .3 for 10 rounds; the
; last round is always dec_last_round.

.1: mf_call dec_round
    mf_call dec_round
.2: mf_call dec_round
    mf_call dec_round
.3: mf_call dec_round
    mf_call dec_round
    mf_call dec_round
    mf_call dec_round
    mf_call dec_round
    mf_call dec_round
    mf_call dec_round
    mf_call dec_round
    mf_call dec_round
    dec_last_round

; move final values to the output array.

    movzx   ebp,word [esp+out_blk+stk_spc]
    mov     [ebp],eax
    mov     [ebp+4],ebx
    mov     [ebp+8],esi
    mov     [ebp+12],edi
    xor     eax,eax             ; return 0

.5: mov     ebp,[esp+16]
    mov     ebx,[esp+12]
    mov     esi,[esp+ 8]
    mov     edi,[esp+ 4]
    add     esp,stk_spc
    do_exit 12

%endif

; inv_mix_col - transform one 32-bit key schedule word for use in
; decryption.  Each byte of EDX is first mapped through the encryption table
; byte (etab_b) and the result is used to index the decryption table; the
; four dwords read through dtab_0..dtab_3 are XORed together.
;
;   in      edx = word to transform
;   out     eax = transformed word
;   scratch ecx, edx, flags
;
; The instruction sequence lives in one helper macro which is emitted either
; as a callable subroutine (REDUCE_CODE_SIZE) or through an in-line macro of
; the same name, so that 'mf_call inv_mix_col' works in both builds.

%macro  inv_mix_col_body 0

    movzx   ecx,dl          ; byte 0
    movzx   ecx,etab_b(ecx)
    mov     eax,dtab_0(ecx)
    movzx   ecx,dh          ; byte 1 (picked up before edx is shifted)
    shr     edx,16          ; bring bytes 2,3 down into dl,dh
    movzx   ecx,etab_b(ecx)
    xor     eax,dtab_1(ecx)
    movzx   ecx,dl          ; byte 2
    movzx   ecx,etab_b(ecx)
    xor     eax,dtab_2(ecx)
    movzx   ecx,dh          ; byte 3
    movzx   ecx,etab_b(ecx)
    xor     eax,dtab_3(ecx)

%endmacro

%ifdef REDUCE_CODE_SIZE

inv_mix_col:
    inv_mix_col_body
    ret

%else

%macro  inv_mix_col 0
    inv_mix_col_body
%endmacro

%endif

%ifdef DECRYPTION_KEY_SCHEDULE

%ifdef AES_128

%ifndef DECRYPTION_TABLE
; %define DECRYPTION_TABLE
%endif

; AES 128-bit decryption key schedule: builds the encryption key schedule
; with _aes_encrypt_key128 and then applies inv_mix_col to every round key
; except the first and the last.  Control continues at dec_end, which is
; shared by all the decryption key schedule routines.
;
; NOTE(review): this routine reads its parameters as 32-bit values at
; [esp+24] and [esp+20] and passes 8 bytes of parameters on, whereas
; _aes_decrypt_key256 in this file reads 16-bit words at [esp+20]/[esp+18]
; and passes 4 bytes.  It appears not to have been adapted to the 16-bit
; build - confirm before enabling AES_128.

    do_name _aes_decrypt_key128,8

    push    ebp
    push    ebx
    push    esi
    push    edi
    mov     eax,[esp+24]    ; context
    mov     edx,[esp+20]    ; key
    push    eax
    push    edx
    do_call _aes_encrypt_key128,8   ; generate expanded encryption key
    mov     eax,10*16
    mov     esi,[esp+24]    ; pointer to first round key
    lea     edi,[esi+eax]   ; pointer to last round key
    add     esi,32
                            ; the inverse mix column transformation
    mov     edx,[esi-16]    ; needs to be applied to all round keys
    mf_call inv_mix_col     ; except first and last. Hence start by
    mov     [esi-16],eax    ; transforming the four sub-keys in the
    mov     edx,[esi-12]    ; second round key
    mf_call inv_mix_col
    mov     [esi-12],eax    ; transformations for subsequent rounds
    mov     edx,[esi-8]     ; can then be made more efficient by
    mf_call inv_mix_col     ; noting that for three of the four sub-keys
    mov     [esi-8],eax     ; in the encryption round key ek[r]:
    mov     edx,[esi-4]     ;
    mf_call inv_mix_col     ;   ek[r][n] = ek[r][n-1] ^ ek[r-1][n]
    mov     [esi-4],eax     ;
                            ; where n is 1..3. Hence the corresponding
.0: mov     edx,[esi]       ; subkeys in the decryption round key dk[r]
    mf_call inv_mix_col     ; also obey since inv_mix_col is linear in
    mov     [esi],eax       ; GF(256):
    xor     eax,[esi-12]    ;
    mov     [esi+4],eax     ;   dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
    xor     eax,[esi-8]     ;
    mov     [esi+8],eax     ; So we only need one inverse mix column
    xor     eax,[esi-4]     ; operation (n = 0) for each four word cycle
    mov     [esi+12],eax    ; in the expanded key.
    add     esi,16
    cmp     edi,esi         ; loop until esi reaches the last round key,
    jg      .0              ; which is left unmodified
    jmp     dec_end

%endif

%ifdef AES_192

%ifndef DECRYPTION_TABLE
; %define DECRYPTION_TABLE
%endif

; AES 192-bit decryption key schedule: builds the encryption key schedule
; with _aes_encrypt_key192 and then applies inv_mix_col to every round key
; except the first and the last.  Control continues at dec_end, which is
; shared by all the decryption key schedule routines.
;
; NOTE(review): this routine reads its parameters as 32-bit values at
; [esp+24] and [esp+20] and passes 8 bytes of parameters on, whereas
; _aes_decrypt_key256 in this file reads 16-bit words at [esp+20]/[esp+18]
; and passes 4 bytes.  It appears not to have been adapted to the 16-bit
; build - confirm before enabling AES_192.

    do_name _aes_decrypt_key192,8

    push    ebp
    push    ebx
    push    esi
    push    edi
    mov     eax,[esp+24]    ; context
    mov     edx,[esp+20]    ; key
    push    eax
    push    edx
    do_call _aes_encrypt_key192,8   ; generate expanded encryption key
    mov     eax,12*16
    mov     esi,[esp+24]    ; first round key
    lea     edi,[esi+eax]   ; last round key
    add     esi,48          ; the first 6 words are the key, of
                            ; which the top 2 words are part of
    mov     edx,[esi-32]    ; the second round key and hence
    mf_call inv_mix_col     ; need to be modified. After this we
    mov     [esi-32],eax    ; need to do a further six values prior
    mov     edx,[esi-28]    ; to using a more efficient technique
    mf_call inv_mix_col     ; based on:
    mov     [esi-28],eax    ;
                            ; dk[r][n] = dk[r][n-1] ^ dk[r-1][n]
    mov     edx,[esi-24]    ;
    mf_call inv_mix_col     ; for n = 1 .. 5 where the key expansion
    mov     [esi-24],eax    ; cycle is now 6 words long
    mov     edx,[esi-20]
    mf_call inv_mix_col
    mov     [esi-20],eax
    mov     edx,[esi-16]
    mf_call inv_mix_col
    mov     [esi-16],eax
    mov     edx,[esi-12]
    mf_call inv_mix_col
    mov     [esi-12],eax
    mov     edx,[esi-8]
    mf_call inv_mix_col
    mov     [esi-8],eax
    mov     edx,[esi-4]
    mf_call inv_mix_col
    mov     [esi-4],eax

.0: mov     edx,[esi]       ; the expanded key is 13 * 4 = 52 32-bit words
    mf_call inv_mix_col     ; of which 11 * 4 = 44 have to be modified
    mov     [esi],eax       ; using inv_mix_col.  We have already done 8
    xor     eax,[esi-20]    ; of these so 36 are left - hence we need
    mov     [esi+4],eax     ; exactly 6 loops of six here
    xor     eax,[esi-16]
    mov     [esi+8],eax
    xor     eax,[esi-12]
    mov     [esi+12],eax
    xor     eax,[esi-8]
    mov     [esi+16],eax
    xor     eax,[esi-4]
    mov     [esi+20],eax
    add     esi,24
    cmp     edi,esi         ; loop until esi reaches the last round key,
    jg      .0              ; which is left unmodified
    jmp     dec_end

%endif

%ifdef AES_256

%ifndef DECRYPTION_TABLE
; %define DECRYPTION_TABLE
%endif

; AES 256-bit decryption key schedule.
;
; After the 2-byte return address come two parameters read as 16-bit words,
; the key address followed by the context address.  The routine builds the
; encryption key schedule with _aes_encrypt_key256 and then applies
; inv_mix_col to every round key except the first and the last.  It saves
; ebp, ebx, esi and edi on the stack and falls through to dec_end, which
; follows this block.

    do_name _aes_decrypt_key256,8

	mov		ax, sp              ; clear the upper half of ESP so that the
	movzx	esp, ax             ; [esp+n] addressing used below is valid
    push    ebp
    push    ebx
    push    esi
    push    edi

    ; 16 bytes of saved registers + 2-byte return address precede the parameters
    movzx   eax, word [esp+20] ; ks
    movzx   edx, word [esp+18] ; key
    push    ax                 ; pass both on as 16-bit parameters
    push    dx
    do_call _aes_encrypt_key256,4   ; generate expanded encryption key
    mov     eax,14*16
    movzx   esi, word [esp+20] ; ks
    lea     edi,[esi+eax]      ; last round key (left unmodified)
    add     esi,64

    mov     edx,[esi-48]    ; the primary key is 8 words, of which
    mf_call inv_mix_col     ; the top four require modification
    mov     [esi-48],eax
    mov     edx,[esi-44]
    mf_call inv_mix_col
    mov     [esi-44],eax
    mov     edx,[esi-40]
    mf_call inv_mix_col
    mov     [esi-40],eax
    mov     edx,[esi-36]
    mf_call inv_mix_col
    mov     [esi-36],eax

    mov     edx,[esi-32]    ; the encryption key expansion cycle is
    mf_call inv_mix_col     ; now eight words long so we need to
    mov     [esi-32],eax    ; start by doing one complete block
    mov     edx,[esi-28]
    mf_call inv_mix_col
    mov     [esi-28],eax
    mov     edx,[esi-24]
    mf_call inv_mix_col
    mov     [esi-24],eax
    mov     edx,[esi-20]
    mf_call inv_mix_col
    mov     [esi-20],eax
    mov     edx,[esi-16]
    mf_call inv_mix_col
    mov     [esi-16],eax
    mov     edx,[esi-12]
    mf_call inv_mix_col
    mov     [esi-12],eax
    mov     edx,[esi-8]
    mf_call inv_mix_col
    mov     [esi-8],eax
    mov     edx,[esi-4]
    mf_call inv_mix_col
    mov     [esi-4],eax

.0: mov     edx,[esi]       ; we can now speed up the remaining
    mf_call inv_mix_col     ; rounds by using the technique
    mov     [esi],eax       ; outlined earlier.  But note that
    xor     eax,[esi-28]    ; there is one extra inverse mix
    mov     [esi+4],eax     ; column operation as the 256 bit
    xor     eax,[esi-24]    ; key has an extra non-linear step
    mov     [esi+8],eax     ; for the midway element.
    xor     eax,[esi-20]
    mov     [esi+12],eax    ; the expanded key is 15 * 4 = 60
    mov     edx,[esi+16]    ; 32-bit words of which 52 need to
    mf_call inv_mix_col     ; be modified.  We have already done
    mov     [esi+16],eax    ; 12 so 40 are left - which means
    xor     eax,[esi-12]    ; that we need exactly 5 loops of 8
    mov     [esi+20],eax
    xor     eax,[esi-8]
    mov     [esi+24],eax
    xor     eax,[esi-4]
    mov     [esi+28],eax
    add     esi,32
    cmp     edi,esi         ; loop until esi reaches the last round key;
    ja      .0              ; addresses are compared as unsigned values

%endif

; ---------------------------------------------------------------------------
; dec_end - common exit for the decryption key schedule routines
;
; Entered with ebp, ebx, esi and edi still saved on the stack (in that push
; order) and, when AES_REV_DKS is defined, with edi holding the address of
; the last round key (ks + 14*16 on the 256-bit path).  Restores the four
; saved registers and returns with eax = 0.
; ---------------------------------------------------------------------------

dec_end:

%ifdef AES_REV_DKS

    ; Reverse the order of the round keys if required: esi walks up from
    ; the first round key, edi walks down from the last one, and the two
    ; 16-byte round keys they address are exchanged until the cursors meet.

    movzx   esi,word [esp+20]       ; esi = ks, read as a 16-bit word

.swap16:
%assign rev_ofs 0
%rep 2                              ; one round key = two 8-byte exchanges
    mov     eax,[esi+rev_ofs]
    mov     ebx,[esi+rev_ofs+4]
    mov     ebp,[edi+rev_ofs]
    mov     edx,[edi+rev_ofs+4]
    mov     [esi+rev_ofs],ebp
    mov     [esi+rev_ofs+4],edx
    mov     [edi+rev_ofs],eax
    mov     [edi+rev_ofs+4],ebx
%assign rev_ofs rev_ofs+8
%endrep
%undef rev_ofs

    sub     edi,16                  ; step both cursors by one round key
    add     esi,16
    cmp     edi,esi
    jg      .swap16                 ; stop once the cursors meet

%endif

    xor     eax,eax                 ; return value 0
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    do_exit  8

%ifdef AES_VAR

; ---------------------------------------------------------------------------
; _aes_decrypt_key - decryption key schedule for a caller-selected key size
;
; Stack arguments on entry (read as 32-bit values):
;   [esp+4]  - forwarded unchanged as the first argument of the routine
;              that is selected (the key, going by that routine's usage)
;   [esp+8]  - key size: 16, 24 or 32, or alternatively 128, 192 or 256
;   [esp+12] - forwarded unchanged as the second argument (the schedule)
;
; Returns with eax = -1 when the key size is none of the accepted values;
; otherwise eax is whatever the selected fixed-size routine returns.
;
; NOTE(review): this routine reads and pushes 32-bit stack slots, whereas
; _aes_decrypt_key256 in this file fetches its arguments as 16-bit words.
; Confirm the two agree before enabling AES_VAR in this build.
; ---------------------------------------------------------------------------

    do_name _aes_decrypt_key,12

    mov     eax,[esp+8]             ; eax = requested key size
    mov     ecx,[esp+4]             ; ecx = first argument
    mov     edx,[esp+12]            ; edx = third argument
    push    edx                     ; set up the two arguments for the
    push    ecx                     ; fixed-size routine

    cmp     eax,16
    je      .len128
    cmp     eax,128
    je      .len128

    cmp     eax,24
    je      .len192
    cmp     eax,192
    je      .len192

    cmp     eax,256
    je      .len256
    cmp     eax,32
    jne     .bad_len

.len256:
    do_call _aes_decrypt_key256,8
    do_exit 12

.len192:
    do_call _aes_decrypt_key192,8
    do_exit 12

.len128:
    do_call _aes_decrypt_key128,8
    do_exit 12

.bad_len:
    add     esp,8                   ; discard the two pushed arguments
    mov     eax,-1                  ; report the unsupported key size
    do_exit 12

%endif

%endif

%ifdef DECRYPTION_TABLE

; Inverse S-box data - 256 entries
;
; This table is only assembled when DECRYPTION_TABLE is defined.  Every
; entry is written with v8() and therefore occupies eight bytes, so the
; 32 rows of 8 entries below take 256 * 8 = 2048 bytes in total.  The
; entry for index i is built from the inverse S-box value of i; the rows
; are in index order, starting with index 0x00 (value 0x52).

    section _DATA

; v8(x) lays out one table entry as the byte sequence
;   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
; i.e. the same four-byte group twice, with the final byte of the second
; group replaced by x itself.  fe, f9, fd and fb are defined outside this
; section - presumably the GF(2^8) multiples of x by 0x0e, 0x09, 0x0d and
; 0x0b used by the inverse mix column step (TODO confirm).

%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x

_aes_dec_tab:
    db  v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38)
    db  v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb)
    db  v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87)
    db  v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb)
    db  v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d)
    db  v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e)
    db  v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2)
    db  v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25)
    db  v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16)
    db  v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92)
    db  v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda)
    db  v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84)
    db  v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a)
    db  v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06)
    db  v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02)
    db  v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b)
    db  v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea)
    db  v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73)
    db  v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85)
    db  v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e)
    db  v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89)
    db  v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b)
    db  v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20)
    db  v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4)
    db  v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31)
    db  v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f)
    db  v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d)
    db  v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef)
    db  v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0)
    db  v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61)
    db  v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26)
    db  v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d)

%endif