VeraCrypt
aboutsummaryrefslogtreecommitdiff
path: root/src/Crypto/Aes_x64.asm
blob: f74d0328adc58fb41501062d14fdf86ac50e550d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
.highlight .hll { background-color: #ffffcc }
.highlight .c { color: #888888 } /* Comment */
.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */
.highlight .k { color: #008800; font-weight: bold } /* Keyword */
.highlight .ch { color: #888888 } /* Comment.Hashbang */
.highlight .cm { color: #888888 } /* Comment.Multiline */
.highlight .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */
.highlight .cpf { color: #888888 } /* Comment.PreprocFile */
.highlight .c1 { color: #888888 } /* Comment.Single */
.highlight .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */
.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
.highlight .ge { font-style: italic } /* Generic.Emph */
.highlight .gr { color: #aa0000 } /* Generic.Error */
.highlight .gh { color: #333333 } /* Generic.Heading */
.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #555555 } /* Generic.Prompt */
.highlight .gs { font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #666666 } /* Generic.Subheading */
.highlight .gt { color: #aa0000 } /* Generic.Traceback */
.highlight .kc { color: #008800; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #008800 } /* Keyword.Pseudo */
.highlight .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #888888; font-weight: bold } /* Keyword.Type */
.highlight .m { color: #0000DD; font-weight: bold } /* Literal.Number */
.highlight .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */
.highlight .na { color: #336699 } /* Name.Attribute */
.highlight .nb { color: #003388 } /* Name.Builtin */
.highlight .nc { color: #bb0066; font-weight: bold } /* Name.Class */
.highlight .no { color: #003366; font-weight: bold } /* Name.Constant */
.highlight .nd { color: #555555 } /* Name.Decorator */
.highlight .ne { color: #bb0066; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #0066bb; font-weight: bold } /* Name.Function */
.highlight .nl { color: #336699; font-style: italic } /* Name.Label */
.highlight .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */
.highlight .py { color: #336699; font-weight: bold } /* Name.Property */
.highlight .nt { color: #bb0066; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #336699 } /* Name.Variable */
.highlight .ow { color: #008800 } /* Operator.Word */
.highlight .w { color: #bbbbbb } /* Text.Whitespace */
.highlight .mb { color: #0000DD; font-weight: bold } /* Literal.Number.Bin */
.highlight .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */
.highlight .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */
.highlight .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */
.highlight .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */
.highlight .sa { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Affix */
.highlight .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */
.highlight .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */
.highlight .dl { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Delimiter */
.highlight .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */
.highlight .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */
.highlight .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */
.highlight .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */
.highlight .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */
.highlight .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */
.highlight .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */
.highlight .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */
.highlight .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */
.highlight .bp { color: #003388 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #0066bb; font-weight: bold } /* Name.Function.Magic */
.highlight .vc { color: #336699 } /* Name.Variable.Class */
.highlight .vg { color: #dd7700 } /* Name.Variable.Global */
.highlight .vi { color: #3333bb } /* Name.Variable.Instance */
.highlight .vm { color: #336699 } /* Name.Variable.Magic */
.highlight .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */
This archive contains the source code of VeraCrypt.
It is based on original TrueCrypt 7.1a with security enhancements and modifications.


Important
=========

You may use the source code contained in this archive only if you accept and
agree to the license terms contained in the file 'License.txt', which is
included in this archive.

Note that the license specifies, for example, that a derived work must not be
called 'TrueCrypt' or 'VeraCrypt'



Contents
========

I. Windows  
   Requirements for Building VeraCrypt for Windows  
   Instructions for Building VeraCrypt for Windows  
	Instructions for Signing and Packaging VeraCrypt for Windows

II. Linux and Mac OS X  
    Requirements for Building VeraCrypt for Linux and Mac OS X  
    Instructions for Building VeraCrypt for Linux and Mac OS X  
	Mac OS X specifics
	
III. FreeBSD and OpenSolaris

IV. Third-Party Developers (Contributors)

V. Legal Information

VI. Further Information



I. Windows
==========

Requirements for Building VeraCrypt for Windows:
------------------------------------------------

- Microsoft Visual C++ 2008 SP1 (Professional Edition or compatible)
- Microsoft Visual C++ 1.52 (available from MSDN Subscriber Downloads)
- Microsoft Windows SDK for Windows 7 (configured for Visual C++)
- Microsoft Windows SDK for Windows 8.1 (needed for SHA-256 code signing)
- Microsoft Windows Driver Kit 7.1.0 (build 7600.16385.1)
- RSA Security Inc. PKCS #11 Cryptographic Token Interface (Cryptoki) 2.20
  header files (available at ftp://ftp.rsasecurity.com/pub/pkcs/pkcs-11/v2-20)
- NASM assembler 2.08 or compatible
- gzip compressor
- upx packer (available at http://upx.sourceforge.net/)

IMPORTANT:

The 64-bit editions of Windows Vista and later versions of Windows, and in
some cases (e.g. playback of HD DVD content) also the 32-bit editions, do not
allow the VeraCrypt driver to run without an appropriate digital signature.
Therefore, all .sys files in official VeraCrypt binary packages are digitally
signed with the digital certificate of the IDRIX, which was
issued by Thawte certification authority. At the end of each official .exe and
.sys file, there are embedded digital signatures and all related certificates
(i.e. all certificates in the relevant certification chain, such as the
certification authority certificates, CA-MS cross-certificate, and the
IDRIX certificate).  
Keep this in mind if you compile VeraCrypt
and compare your binaries with the official binaries. If your binaries are
unsigned, the sizes of the official binaries will usually be approximately
10 KB greater than sizes of your binaries (there may be further differences
if you use a different version of the compiler, or if you install a different
or no service pack for Visual Studio, or different hotfixes for it, or if you
use different versions of the required SDKs).


Instructions for Building VeraCrypt for Windows:
------------------------------------------------

1) Create an environment variable 'MSVC16_ROOT' pointing to the folder 'MSVC15'
   extracted from the Visual C++ 1.52 self-extracting package.

   Note: The 16-bit installer MSVC15\SETUP.EXE cannot be run on 64-bit Windows,
   but it is actually not necessary to run it. You only need to extract the
   folder 'MSVC15', which contains the 32-bit binaries required to build the
   VeraCrypt Boot Loader.

2) If you have installed the Windows Driver Development Kit in another
   directory than '%SYSTEMDRIVE%\WinDDK', create an environment variable
   'WINDDK_ROOT' pointing to the DDK installation directory.

3) Copy the PKCS #11 header files to a standard include path or create an
   environment variable 'PKCS11_INC' pointing to the directory where
   the PKCS #11 header files are installed.

4) Open the solution file 'VeraCrypt.sln' in Microsoft Visual Studio 2008.

5) Select 'All' as the active solution configuration.

6) Build the solution.

7) If successful, there should be newly built VeraCrypt binaries in the
   'Release' folder.

Instructions for Signing and Packaging VeraCrypt for Windows:
-------------------------------------------------------------

First, create an environment variable 'WSDK81' pointing to the Windows SDK
for Windows 8.1 installation directory.
The folder "Signing" contains a batch file (sign.bat) that will sign all 
VeraCrypt components using a code signing certificate present on the 
certificate store and also build the final installation setup.
The batch file suppose that the code signing certificate is issued by Thawt.
This is the case for IDRIX's certificate. If yours is issued by another CA, 
then you should put the Root and Intermediate certificates in the "Signing" 
folder and then modify sign.bat accordingly.


II. Linux and Mac OS X
======================

Requirements for Building VeraCrypt for Linux and Mac OS X:
-----------------------------------------------------------

- GNU Make
- GNU C++ Compiler 4.0 or compatible
- Apple Xcode (Mac OS X only)
- NASM assembler 2.08 or compatible (x86/x64 architecture only)
- pkg-config
- makeself (Linux only)
- wxWidgets 3.0 shared library and header files installed or
  wxWidgets 3.0 library source code (available at http://www.wxwidgets.org)
- FUSE library and header files (available at http://fuse.sourceforge.net
  and https://osxfuse.github.io/)
- RSA Security Inc. PKCS #11 Cryptographic Token Interface (Cryptoki) 2.20
  header files (available at ftp://ftp.rsasecurity.com/pub/pkcs/pkcs-11/v2-20).
  They are already included in the source tree under the directory PKCS11 but 
  it is possible to override it using the environment variable 'PKCS11_INC'. 


Instructions for Building VeraCrypt for Linux and Mac OS X:
-----------------------------------------------------------

1) Change the current directory to the root of the VeraCrypt source code.

2) If you have no wxWidgets shared library installed, run the following
   command to configure the wxWidgets static library for VeraCrypt and to
   build it: 

   $ make WXSTATIC=1 WX_ROOT=/usr/src/wxWidgets wxbuild

   The variable WX_ROOT must point to the location of the source code of the
   wxWidgets library. Output files will be placed in the './wxrelease/'
   directory.

3) To build VeraCrypt, run the following command:

   $ make

   or if you have no wxWidgets shared library installed:
   
   $ make WXSTATIC=1

4) If successful, the VeraCrypt executable should be located in the directory
   'Main'.

By default, a universal executable supporting both graphical and text user
interface (through the switch --text) is built.
On Linux, a console-only executable, which requires no GUI library, can be
built using the 'NOGUI' parameter:

   $ make NOGUI=1 WXSTATIC=1 WX_ROOT=/usr/src/wxWidgets wxbuild
   $ make NOGUI=1 WXSTATIC=1
   
On MacOSX, building a console-only executable is not supported.

Mac OS X specifics:
-----------------------------------------------------------
   
Under MacOSX, the SDK for OSX 10.7 is used by default. To use another version
of the SDK (i.e. 10.6), you can export the environment variable VC_OSX_TARGET:
	
	$ export VC_OSX_TARGET=10.6


Before building under MacOSX, pkg-config must be installed if not yet available.
Get it from http://pkgconfig.freedesktop.org/releases/pkg-config-0.28.tar.gz and
compile using the following commands : 

	$ ./configure --with-internal-glib
	$ make
	$ sudo make install

After making sure pkg-config is available, download and install OSXFuse from
https://osxfuse.github.io/ (MacFUSE compatibility layer must selected)

The script build_veracrypt_macosx.sh available under "src/Build" performs the 
full build of VeraCrypt including the creation of the installer pkg. It expects
to find the wxWidgets 3.0.2 sources at the same level as where you put 
VeraCrypt sources (i.e. if "src" path is "/Users/joe/Projects/VeraCrypt/src"
then wxWidgets should be at "/Users/joe/Projects/wxWidgets-wxWidgets-3.0.2")

The build process uses Code Signing certificates whose ID is specified in
src/Main/Main.make (lines 167 & 169). You'll have to modify these lines to put
the ID of your Code Signing certificates or comment them if you don't have one.

Because of incompatibility issues with OSXFUSE, the SDK 10.9 generates a
VeraCrypt binary that has issues communicating with the OSXFUSE kernel extension.
Thus, we recommend to use the SDK 10.8 or earlier for building VeraCrypt.



III. FreeBSD and OpenSolaris
============================

FreeBSD and OpenSolaris are not yet supported.



IV. Third-Party Developers (Contributors)
=========================================

If you intend to implement a feature, please contact us first to make sure:

1) That the feature has not been implemented (we may have already implemented
   it, but haven't released the code yet).  
2) That the feature is acceptable.  
3) Whether we need help of third-party developers with implementing the feature.

Information on how to contact us can be found at:
https://veracrypt.codeplex.com/



V. Legal Information
====================

Copyright Information
---------------------

This software as a whole:
Copyright (c) 2013-2015 IDRIX. All rights reserved.

Portions of this software:
Copyright (c) 2003-2012 TrueCrypt Developers Association. All rights reserved.
Copyright (c) 1998-2000 Paul Le Roux. All rights reserved.
Copyright (c) 1998-2008 Brian Gladman, Worcester, UK. All rights reserved.
Copyright (c) 2002-2004 Mark Adler. All rights reserved.
For more information, please see the legal notices attached to parts of the
source code.

Trademark Information
---------------------

Any trademarks contained in the source code, binaries, and/or in the 
documentation, are the sole property of their respective owners.



VI. Further Information
=======================

http://www.veracrypt.fr
d='n713' href='#n713'>713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907
; ---------------------------------------------------------------------------
; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
;
; LICENSE TERMS
;
; The free distribution and use of this software is allowed (with or without
; changes) provided that:
;
;  1. source code distributions include the above copyright notice, this
;     list of conditions and the following disclaimer;
;
;  2. binary distributions include the above copyright notice, this list
;     of conditions and the following disclaimer in their documentation;
;
;  3. the name of the copyright holder is not used to endorse products
;     built using this software without specific written permission.
;
; DISCLAIMER
;
; This software is provided 'as is' with no explicit or implied warranties
; in respect of its properties, including, but not limited to, correctness
; and/or fitness for purpose.
; ---------------------------------------------------------------------------
; Issue 20/12/2007
;
; I am grateful to Dag Arne Osvik for many discussions of the techniques that
; can be used to optimise AES assembler code on AMD64/EM64T architectures.
; Some of the techniques used in this implementation are the result of
; suggestions made by him for which I am most grateful.

;
; Adapted for TrueCrypt:
; - Compatibility with NASM
;

; An AES implementation for AMD64 processors using the YASM assembler.  This
; implemetation provides only encryption, decryption and hence requires key
; scheduling support in C. It uses 8k bytes of tables but its encryption and
; decryption performance is very close to that obtained using large tables.
; It can use either Windows or Gnu/Linux calling conventions, which are as
; follows:
;               windows  gnu/linux
;
;   in_blk          rcx     rdi
;   out_blk         rdx     rsi
;   context (cx)     r8     rdx
;
;   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
;   registers       rdi      -      on both
;
;   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
;   registers        -      rdi     on both
;
; The default convention is that for windows, the gnu/linux convention being
; used if __GNUC__ is defined.
;
; Define _SEH_ to include support for Win64 structured exception handling
; (this requires YASM version 0.6 or later).
;
; This code provides the standard AES block size (128 bits, 16 bytes) and the
; three standard AES key sizes (128, 192 and 256 bits). It has the same call
; interface as my C implementation.  It uses the Microsoft C AMD64 calling
; conventions in which the three parameters are placed in  rcx, rdx and r8
; respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
;
;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
;                                            const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
;                                            const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_decrypt_ctx cx[1]);
;
; where <NNN> is 128, 102 or 256.  In the last two calls the length can be in
; either bits or bytes.
;
; Comment in/out the following lines to obtain the desired subroutines. These
; selections MUST match those in the C header file aes.h

; %define AES_128                 ; define if AES with 128 bit keys is needed
; %define AES_192                 ; define if AES with 192 bit keys is needed
%define AES_256                 ; define if AES with 256 bit keys is needed
; %define AES_VAR                 ; define if a variable key size is needed
%define ENCRYPTION              ; define if encryption is needed
%define DECRYPTION              ; define if decryption is needed
%define AES_REV_DKS             ; define if key decryption schedule is reversed
%define LAST_ROUND_TABLES       ; define for the faster version using extra tables

; The encryption key schedule has the following in memory layout where N is the
; number of rounds (10, 12 or 14):
;
; lo: | input key (round 0)  |  ; each round is four 32-bit words
;     | encryption round 1   |
;     | encryption round 2   |
;     ....
;     | encryption round N-1 |
; hi: | encryption round N   |
;
; The decryption key schedule is normally set up so that it has the same
; layout as above by actually reversing the order of the encryption key
; schedule in memory (this happens when AES_REV_DKS is set):
;
; lo: | decryption round 0   | =              | encryption round N   |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
;     ....                       ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
; hi: | decryption round N   | =              | input key (round 0)  |
;
; with rounds except the first and last modified using inv_mix_column()
; But if AES_REV_DKS is NOT set the order of keys is left as it is for
; encryption so that it has to be accessed in reverse when used for
; decryption (although the inverse mix column modifications are done)
;
; lo: | decryption round 0   | =              | input key (round 0)  |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
;     ....                       ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
; hi: | decryption round N   | =              | encryption round N   |
;
; This layout is faster when the assembler key scheduling provided here
; is used.
;
; The DLL interface must use the _stdcall convention in which the number
; of bytes of parameter space is added after an @ to the sutine's name.
; We must also remove our parameters from the stack before return (see
; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.

;%define DLL_EXPORT

; End of user defines

%ifdef AES_VAR
%ifndef AES_128
%define AES_128
%endif
%ifndef AES_192
%define AES_192
%endif
%ifndef AES_256
%define AES_256
%endif
%endif

%ifdef AES_VAR
%define KS_LENGTH       60
%elifdef AES_256
%define KS_LENGTH       60
%elifdef AES_192
%define KS_LENGTH       52
%else
%define KS_LENGTH       44
%endif

%define     r0  rax
%define     r1  rdx
%define     r2  rcx
%define     r3  rbx
%define     r4  rsi
%define     r5  rdi
%define     r6  rbp
%define     r7  rsp

%define     raxd    eax
%define     rdxd    edx
%define     rcxd    ecx
%define     rbxd    ebx
%define     rsid    esi
%define     rdid    edi
%define     rbpd    ebp
%define     rspd    esp

%define     raxb    al
%define     rdxb    dl
%define     rcxb    cl
%define     rbxb    bl
%define     rsib    sil
%define     rdib    dil
%define     rbpb    bpl
%define     rspb    spl

%define     r0h ah
%define     r1h dh
%define     r2h ch
%define     r3h bh

%define     r0d eax
%define     r1d edx
%define     r2d ecx
%define     r3d ebx

; finite field multiplies by {02}, {04} and {08}

%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

; finite field multiplies required in table generation

%define f3(x)   (f2(x) ^ x)
%define f9(x)   (f8(x) ^ x)
%define fb(x)   (f8(x) ^ f2(x) ^ x)
%define fd(x)   (f8(x) ^ f4(x) ^ x)
%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))

; macro for expanding S-box data

%macro enc_vals 1
    db  %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)
    db  %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)
    db  %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)
    db  %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)
    db  %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)
    db  %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)
    db  %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)
    db  %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)
    db  %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)
    db  %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)
    db  %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)
    db  %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)
    db  %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)
    db  %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)
    db  %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)
    db  %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)
    db  %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)
    db  %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)
    db  %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)
    db  %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)
    db  %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)
    db  %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)
    db  %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)
    db  %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)
    db  %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)
    db  %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)
    db  %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)
    db  %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)
    db  %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)
    db  %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)
    db  %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)
    db  %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)
%endmacro

%macro dec_vals 1
    db  %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)
    db  %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)
    db  %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)
    db  %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)
    db  %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)
    db  %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)
    db  %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)
    db  %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)
    db  %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)
    db  %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)
    db  %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)
    db  %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)
    db  %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)
    db  %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)
    db  %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)
    db  %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)
    db  %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)
    db  %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)
    db  %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)
    db  %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)
    db  %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)
    db  %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)
    db  %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)
    db  %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)
    db  %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)
    db  %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)
    db  %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)
    db  %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)
    db  %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)
    db  %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)
    db  %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)
    db  %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)
%endmacro

%define u8(x)   f2(x), x, x, f3(x), f2(x), x, x, f3(x)
%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
%define w8(x)   x, 0, 0, 0, x, 0, 0, 0

%define tptr    rbp     ; table pointer
%define kptr    r8      ; key schedule pointer
%define fofs    128     ; adjust offset in key schedule to keep |disp| < 128
%define fk_ref(x,y) [kptr-16*x+fofs+4*y]
%ifdef  AES_REV_DKS
%define rofs    128
%define ik_ref(x,y) [kptr-16*x+rofs+4*y]
%else
%define rofs    -128
%define ik_ref(x,y) [kptr+16*x+rofs+4*y]
%endif

%define tab_0(x)   [tptr+8*x]
%define tab_1(x)   [tptr+8*x+3]
%define tab_2(x)   [tptr+8*x+2]
%define tab_3(x)   [tptr+8*x+1]
%define tab_f(x)   byte [tptr+8*x+1]
%define tab_i(x)   byte [tptr+8*x+7]
%define t_ref(x,r) tab_ %+ x(r)

%macro ff_rnd 5                 ; normal forward round
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    mov     eax,%1d
    mov     ebx,%2d
    mov     ecx,%3d
    mov     edx,%4d
%endmacro

%ifdef LAST_ROUND_TABLES

%macro fl_rnd 5                 ; last forward round
    add     tptr, 2048
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)
%endmacro

%else

%macro fl_rnd 5                 ; last forward round
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    xor     %1d, esi
    rol     edi, 8
    xor     %4d, edi
    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %3d, esi
    xor     %2d, edi

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    xor     %2d, esi
    rol     edi, 8
    xor     %1d, edi
    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %4d, esi
    xor     %3d, edi

    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    shr     ecx, 16
    xor     %3d, esi
    rol     edi, 8
    xor     %2d, edi
    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %1d, esi
    xor     %4d, edi

    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    shr     edx, 16
    xor     %4d, esi
    rol     edi, 8
    xor     %3d, edi
    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %2d, esi
    xor     %1d, edi
%endmacro

%endif

%macro ii_rnd 5                 ; normal inverse round
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    mov     eax,%1d
    mov     ebx,%2d
    mov     ecx,%3d
    mov     edx,%4d
%endmacro

%ifdef LAST_ROUND_TABLES

%macro il_rnd 5                 ; last inverse round
    add     tptr, 2048
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)
%endmacro

%else

%macro il_rnd 5                 ; last inverse round
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     eax, 16
    xor     %1d, esi
    rol     edi, 8
    xor     %2d, edi
    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %3d, esi
    xor     %4d, edi

    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     ebx, 16
    xor     %2d, esi
    rol     edi, 8
    xor     %3d, edi
    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %4d, esi
    xor     %1d, edi

    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     ecx, 16
    xor     %3d, esi
    rol     edi, 8
    xor     %4d, edi
    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %1d, esi
    xor     %2d, edi

    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     edx, 16
    xor     %4d, esi
    rol     edi, 8
    xor     %1d, edi
    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %2d, esi
    xor     %3d, edi
%endmacro

%endif

%ifdef ENCRYPTION

    global  aes_encrypt
%ifdef DLL_EXPORT
    export  aes_encrypt
%endif

    section .data align=64
    align   64
enc_tab:
    enc_vals u8
%ifdef LAST_ROUND_TABLES
    enc_vals w8
%endif

    section .text align=16
    align   16

%ifdef _SEH_
proc_frame aes_encrypt
	alloc_stack	7*8			; 7 to align stack to 16 bytes
	save_reg	rsi,4*8
	save_reg	rdi,5*8
	save_reg	rbx,1*8
	save_reg	rbp,2*8
	save_reg	r12,3*8
end_prologue
    mov     rdi, rcx        ; input pointer
    mov     [rsp+0*8], rdx  ; output pointer
%else
	aes_encrypt:
	%ifdef __GNUC__
		sub     rsp, 4*8        ; gnu/linux binary interface
		mov     [rsp+0*8], rsi  ; output pointer
		mov     r8, rdx         ; context
	%else
		sub     rsp, 6*8        ; windows binary interface
		mov     [rsp+4*8], rsi
		mov     [rsp+5*8], rdi
		mov     rdi, rcx        ; input pointer
		mov     [rsp+0*8], rdx  ; output pointer
	%endif
		mov     [rsp+1*8], rbx  ; input pointer in rdi
		mov     [rsp+2*8], rbp  ; output pointer in [rsp]
		mov     [rsp+3*8], r12  ; context in r8
%endif

    movzx   esi, byte [kptr+4*KS_LENGTH]
    lea     tptr, [rel enc_tab]
    sub     kptr, fofs

    mov     eax, [rdi+0*4]
    mov     ebx, [rdi+1*4]
    mov     ecx, [rdi+2*4]
    mov     edx, [rdi+3*4]

    xor     eax, [kptr+fofs]
    xor     ebx, [kptr+fofs+4]
    xor     ecx, [kptr+fofs+8]
    xor     edx, [kptr+fofs+12]

    lea     kptr,[kptr+rsi]
    cmp     esi, 10*16
    je      .3
    cmp     esi, 12*16
    je      .2
    cmp     esi, 14*16
    je      .1
    mov     rax, -1
    jmp     .4

.1: ff_rnd  r9, r10, r11, r12, 13
    ff_rnd  r9, r10, r11, r12, 12
.2: ff_rnd  r9, r10, r11, r12, 11
    ff_rnd  r9, r10, r11, r12, 10
.3: ff_rnd  r9, r10, r11, r12, 9
    ff_rnd  r9, r10, r11, r12, 8
    ff_rnd  r9, r10, r11, r12, 7
    ff_rnd  r9, r10, r11, r12, 6
    ff_rnd  r9, r10, r11, r12, 5
    ff_rnd  r9, r10, r11, r12, 4
    ff_rnd  r9, r10, r11, r12, 3
    ff_rnd  r9, r10, r11, r12, 2
    ff_rnd  r9, r10, r11, r12, 1
    fl_rnd  r9, r10, r11, r12, 0

    mov     rbx, [rsp]
    mov     [rbx], r9d
    mov     [rbx+4], r10d
    mov     [rbx+8], r11d
    mov     [rbx+12], r12d
    xor     rax, rax
.4:
    mov     rbx, [rsp+1*8]
    mov     rbp, [rsp+2*8]
    mov     r12, [rsp+3*8]
%ifdef __GNUC__
    add     rsp, 4*8
    ret
%else
		mov     rsi, [rsp+4*8]
		mov     rdi, [rsp+5*8]
	%ifdef _SEH_
		add     rsp, 7*8
		ret
	endproc_frame
	%else
		add     rsp, 6*8
		ret
	%endif
%endif

%endif

%ifdef DECRYPTION

    global  aes_decrypt
%ifdef DLL_EXPORT
    export  aes_decrypt
%endif

    section .data
    align   64
dec_tab:
    dec_vals v8
%ifdef LAST_ROUND_TABLES
    dec_vals w8
%endif

    section .text
    align   16

%ifdef _SEH_
proc_frame aes_decrypt
	alloc_stack	7*8			; 7 to align stack to 16 bytes
	save_reg	rsi,4*8
	save_reg	rdi,5*8
	save_reg	rbx,1*8
	save_reg	rbp,2*8
	save_reg	r12,3*8
end_prologue
    mov     rdi, rcx        ; input pointer
    mov     [rsp+0*8], rdx  ; output pointer
%else
	aes_decrypt:
	%ifdef __GNUC__
		sub     rsp, 4*8        ; gnu/linux binary interface
		mov     [rsp+0*8], rsi  ; output pointer
		mov     r8, rdx         ; context
	%else
		sub     rsp, 6*8        ; windows binary interface
		mov     [rsp+4*8], rsi
		mov     [rsp+5*8], rdi
		mov     rdi, rcx        ; input pointer
		mov     [rsp+0*8], rdx  ; output pointer
	%endif
		mov     [rsp+1*8], rbx  ; input pointer in rdi
		mov     [rsp+2*8], rbp  ; output pointer in [rsp]
		mov     [rsp+3*8], r12  ; context in r8
%endif

    movzx   esi,byte[kptr+4*KS_LENGTH]
    lea     tptr, [rel dec_tab]
    sub     kptr, rofs

    mov     eax, [rdi+0*4]
    mov     ebx, [rdi+1*4]
    mov     ecx, [rdi+2*4]
    mov     edx, [rdi+3*4]

%ifdef      AES_REV_DKS
    mov     rdi, kptr
    lea     kptr,[kptr+rsi]
%else
    lea     rdi,[kptr+rsi]
%endif

    xor     eax, [rdi+rofs]
    xor     ebx, [rdi+rofs+4]
    xor     ecx, [rdi+rofs+8]
    xor     edx, [rdi+rofs+12]

    cmp     esi, 10*16
    je      .3
    cmp     esi, 12*16
    je      .2
    cmp     esi, 14*16
    je      .1
    mov     rax, -1
    jmp     .4

.1: ii_rnd  r9, r10, r11, r12, 13
    ii_rnd  r9, r10, r11, r12, 12
.2: ii_rnd  r9, r10, r11, r12, 11
    ii_rnd  r9, r10, r11, r12, 10
.3: ii_rnd  r9, r10, r11, r12, 9
    ii_rnd  r9, r10, r11, r12, 8
    ii_rnd  r9, r10, r11, r12, 7
    ii_rnd  r9, r10, r11, r12, 6
    ii_rnd  r9, r10, r11, r12, 5
    ii_rnd  r9, r10, r11, r12, 4
    ii_rnd  r9, r10, r11, r12, 3
    ii_rnd  r9, r10, r11, r12, 2
    ii_rnd  r9, r10, r11, r12, 1
    il_rnd  r9, r10, r11, r12, 0

    mov     rbx, [rsp]
    mov     [rbx], r9d
    mov     [rbx+4], r10d
    mov     [rbx+8], r11d
    mov     [rbx+12], r12d
    xor     rax, rax
.4: mov     rbx, [rsp+1*8]
    mov     rbp, [rsp+2*8]
    mov     r12, [rsp+3*8]
%ifdef __GNUC__
    add     rsp, 4*8
    ret
%else
		mov     rsi, [rsp+4*8]
		mov     rdi, [rsp+5*8]
	%ifdef _SEH_
		add     rsp, 7*8
		ret
	endproc_frame
	%else
		add     rsp, 6*8
		ret
	%endif
%endif

%endif